diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..18ed278
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,742 @@
+name: Test
+
+on: [pull_request, merge_group]
+
+jobs:
+  test-ubuntu-24-04:
+    name: Ubuntu 24.04
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v4
+
+    - uses: conda-incubator/setup-miniconda@v3
+      with:
+        auto-update-conda: true
+        conda-remove-defaults: true
+
+    - name: Install Toolchain
+      run: |
+        # Install only the compilers this job builds with (GCC 9-13, Clang 14-17, MinGW-w64) plus Ninja.
+        sudo apt-get update
+        sudo apt-get -y install {gcc,g++}-{9,10,11,12,13,mingw-w64-x86-64} clang-{14,15,16,17} ninja-build
+        conda create -y --name cuda-env cuda-minimal-build
+        rm -f "$CONDA/envs/cuda-env/bin/ld"
+
+    - name: Build (GCC 9)
+      run: |
+        cd tests
+        mkdir build-gcc-9
+        cd build-gcc-9
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-9 -DCMAKE_CXX_COMPILER=g++-9 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (GCC 10)
+      run: |
+        cd tests
+        mkdir build-gcc-10
+        cd build-gcc-10
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_CXX_COMPILER=g++-10 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (GCC 11)
+      run: |
+        cd tests
+        mkdir build-gcc-11
+        cd build-gcc-11
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-11 -DCMAKE_CXX_COMPILER=g++-11 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (GCC 12)
+      run: |
+        cd tests
+        mkdir build-gcc-12
+        cd build-gcc-12
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-12 -DCMAKE_CXX_COMPILER=g++-12 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (GCC 13)
+      run: |
+        cd tests
+        mkdir build-gcc-13
+        cd build-gcc-13
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-13 -DCMAKE_CXX_COMPILER=g++-13 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+
+    - name: Build (Clang 14)
+      run: |
+        cd tests
+        mkdir build-clang-14
+        cd build-clang-14
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-14 -DCMAKE_CXX_COMPILER=clang++-14 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 15)
+      run: |
+        cd tests
+        mkdir build-clang-15
+        cd build-clang-15
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-15 -DCMAKE_CXX_COMPILER=clang++-15 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 16)
+      run: |
+        cd tests
+        mkdir build-clang-16
+        cd build-clang-16
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-16 -DCMAKE_CXX_COMPILER=clang++-16 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 17)
+      run: |
+        cd tests
+        mkdir build-clang-17
+        cd build-clang-17
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-17 -DCMAKE_CXX_COMPILER=clang++-17 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+
+    - name: Build (MinGW-w64 GCC)
+      run: |
+        cd tests
+        mkdir build-mingw
+        cd build-mingw
+        cmake -G Ninja -DCMAKE_TOOLCHAIN_FILE=../mingw-w64-x86_64.cmake -DENABLE_CUDA=False ..
+        ninja
+
+    - name: Run (GCC 9)
+      run: |
+        cd tests/build-gcc-9
+        ctest --output-on-failure
+    - name: Run (GCC 10)
+      run: |
+        cd tests/build-gcc-10
+        ctest --output-on-failure
+    - name: Run (GCC 11)
+      run: |
+        cd tests/build-gcc-11
+        ctest --output-on-failure
+    - name: Run (GCC 12)
+      run: |
+        cd tests/build-gcc-12
+        ctest --output-on-failure
+    - name: Run (GCC 13)
+      run: |
+        cd tests/build-gcc-13
+        ctest --output-on-failure
+
+    - name: Run (Clang 14)
+      run: |
+        cd tests/build-clang-14
+        ctest --output-on-failure
+    - name: Run (Clang 15)
+      run: |
+        cd tests/build-clang-15
+        ctest --output-on-failure
+    - name: Run (Clang 16)
+      run: |
+        cd tests/build-clang-16
+        ctest --output-on-failure
+    - name: Run (Clang 17)
+      run: |
+        cd tests/build-clang-17
+        ctest --output-on-failure
+
+  test-ubuntu-22-04:
+    name: Ubuntu 22.04
+    runs-on: ubuntu-22.04
+    steps:
+    - uses: actions/checkout@v4
+
+    - uses: conda-incubator/setup-miniconda@v3
+      with:
+        auto-update-conda: true
+        conda-remove-defaults: true
+
+    - name: Install Toolchain
+      run: |
+        # Host compilers and Ninja come from apt; nvcc comes from the cuda-env conda environment.
+        sudo apt-get update
+        sudo apt-get -y install {gcc,g++}-{9,10,11,12,mingw-w64-x86-64} clang-{11,12,13,14,15} ninja-build
+        conda create -y --name cuda-env cuda-minimal-build
+        rm -f "$CONDA/envs/cuda-env/bin/ld"
+
+    - name: Build (GCC 9)
+      run: |
+        cd tests
+        mkdir build-gcc-9
+        cd build-gcc-9
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-9 -DCMAKE_CXX_COMPILER=g++-9 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (GCC 10)
+      run: |
+        cd tests
+        mkdir build-gcc-10
+        cd build-gcc-10
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_CXX_COMPILER=g++-10 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (GCC 11)
+      run: |
+        cd tests
+        mkdir build-gcc-11
+        cd build-gcc-11
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-11 -DCMAKE_CXX_COMPILER=g++-11 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (GCC 12)
+      run: |
+        cd tests
+        mkdir build-gcc-12
+        cd build-gcc-12
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-12 -DCMAKE_CXX_COMPILER=g++-12 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+
+    - name: Build (Clang 11)
+      run: |
+        cd tests
+        mkdir build-clang-11
+        cd build-clang-11
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-11 -DCMAKE_CXX_COMPILER=clang++-11 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 12)
+      run: |
+        cd tests
+        mkdir build-clang-12
+        cd build-clang-12
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-12 -DCMAKE_CXX_COMPILER=clang++-12 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 13)
+      run: |
+        cd tests
+        mkdir build-clang-13
+        cd build-clang-13
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-13 -DCMAKE_CXX_COMPILER=clang++-13 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 14)
+      run: |
+        cd tests
+        mkdir build-clang-14
+        cd build-clang-14
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-14 -DCMAKE_CXX_COMPILER=clang++-14 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 15)
+      run: |
+        cd tests
+        mkdir build-clang-15
+        cd build-clang-15
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-15 -DCMAKE_CXX_COMPILER=clang++-15 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+
+    - name: Build (MinGW-w64 GCC)
+      run: |
+        cd tests
+        mkdir build-mingw
+        cd build-mingw
+        cmake -G Ninja -DCMAKE_TOOLCHAIN_FILE=../mingw-w64-x86_64.cmake -DENABLE_CUDA=False ..
+        ninja
+
+    - name: Run (GCC 9)
+      run: |
+        cd tests/build-gcc-9
+        ctest --output-on-failure
+    - name: Run (GCC 10)
+      run: |
+        cd tests/build-gcc-10
+        ctest --output-on-failure
+    - name: Run (GCC 11)
+      run: |
+        cd tests/build-gcc-11
+        ctest --output-on-failure
+    - name: Run (GCC 12)
+      run: |
+        cd tests/build-gcc-12
+        ctest --output-on-failure
+
+    - name: Run (Clang 11)
+      run: |
+        cd tests/build-clang-11
+        ctest --output-on-failure
+    - name: Run (Clang 12)
+      run: |
+        cd tests/build-clang-12
+        ctest --output-on-failure
+    - name: Run (Clang 13)
+      run: |
+        cd tests/build-clang-13
+        ctest --output-on-failure
+    - name: Run (Clang 14)
+      run: |
+        cd tests/build-clang-14
+        ctest --output-on-failure
+    - name: Run (Clang 15)
+      run: |
+        cd tests/build-clang-15
+        ctest --output-on-failure
+
+  test-ubuntu-20-04:
+    name: Ubuntu 20.04
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v4
+
+    - uses: conda-incubator/setup-miniconda@v3
+      with:
+        auto-update-conda: true
+        conda-remove-defaults: true
+
+    - name: Install Toolchain
+      run: |
+        # Host compilers and Ninja come from apt; nvcc comes from the cuda-env conda environment.
+        sudo apt-get update
+        sudo apt-get -y install {gcc,g++}-{7,8,9,10,mingw-w64-x86-64} clang-{7,8,9,10,11,12} ninja-build
+        conda create -y --name cuda-env cuda-minimal-build
+        rm -f "$CONDA/envs/cuda-env/bin/ld"
+
+    - name: Build (GCC 7)
+      run: |
+        cd tests
+        mkdir build-gcc-7
+        cd build-gcc-7
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-7 -DCMAKE_CXX_COMPILER=g++-7 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (GCC 8)
+      run: |
+        cd tests
+        mkdir build-gcc-8
+        cd build-gcc-8
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-8 -DCMAKE_CXX_COMPILER=g++-8 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (GCC 9)
+      run: |
+        cd tests
+        mkdir build-gcc-9
+        cd build-gcc-9
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-9 -DCMAKE_CXX_COMPILER=g++-9 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (GCC 10)
+      run: |
+        cd tests
+        mkdir build-gcc-10
+        cd build-gcc-10
+        cmake -G Ninja -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_CXX_COMPILER=g++-10 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+
+    - name: Build (Clang 7)
+      run: |
+        cd tests
+        mkdir build-clang-7
+        cd build-clang-7
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-7 -DCMAKE_CXX_COMPILER=clang++-7 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 8)
+      run: |
+        cd tests
+        mkdir build-clang-8
+        cd build-clang-8
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-8 -DCMAKE_CXX_COMPILER=clang++-8 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 9)
+      run: |
+        cd tests
+        mkdir build-clang-9
+        cd build-clang-9
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-9 -DCMAKE_CXX_COMPILER=clang++-9 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 10)
+      run: |
+        cd tests
+        mkdir build-clang-10
+        cd build-clang-10
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-10 -DCMAKE_CXX_COMPILER=clang++-10 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 11)
+      run: |
+        cd tests
+        mkdir build-clang-11
+        cd build-clang-11
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-11 -DCMAKE_CXX_COMPILER=clang++-11 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+    - name: Build (Clang 12)
+      run: |
+        cd tests
+        mkdir build-clang-12
+        cd build-clang-12
+        cmake -G Ninja -DCMAKE_C_COMPILER=clang-12 -DCMAKE_CXX_COMPILER=clang++-12 -DCMAKE_CUDA_COMPILER="$CONDA/envs/cuda-env/bin/nvcc" ..
+        ninja
+
+    - name: Build (MinGW-w64 GCC)
+      run: |
+        cd tests
+        mkdir build-mingw
+        cd build-mingw
+        cmake -G Ninja -DCMAKE_TOOLCHAIN_FILE=../mingw-w64-x86_64.cmake -DENABLE_CUDA=False ..
+        ninja
+
+    - name: Run (GCC 7)
+      run: |
+        cd tests/build-gcc-7
+        ctest --output-on-failure
+    - name: Run (GCC 8)
+      run: |
+        cd tests/build-gcc-8
+        ctest --output-on-failure
+    - name: Run (GCC 9)
+      run: |
+        cd tests/build-gcc-9
+        ctest --output-on-failure
+    - name: Run (GCC 10)
+      run: |
+        cd tests/build-gcc-10
+        ctest --output-on-failure
+
+    - name: Run (Clang 7)
+      run: |
+        cd tests/build-clang-7
+        ctest --output-on-failure
+    - name: Run (Clang 8)
+      run: |
+        cd tests/build-clang-8
+        ctest --output-on-failure
+    - name: Run (Clang 9)
+      run: |
+        cd tests/build-clang-9
+        ctest --output-on-failure
+    - name: Run (Clang 10)
+      run: |
+        cd tests/build-clang-10
+        ctest --output-on-failure
+    - name: Run (Clang 11)
+      run: |
+        cd tests/build-clang-11
+        ctest --output-on-failure
+    - name: Run (Clang 12)
+      run: |
+        cd tests/build-clang-12
+        ctest --output-on-failure
+
+  test-windows-2022-a:
+    name: Windows Server 2022 (A)
+    runs-on: windows-2022
+    steps:
+    - uses: actions/checkout@v4
+
+    - uses: conda-incubator/setup-miniconda@v3
+      with:
+        auto-update-conda: true
+        conda-remove-defaults: true
+
+    - name: Conda - Create Environment
+      shell: cmd
+      run: |
+        conda create -y --name cuda-env
+
+    - name: Conda - Install CUDA Toolchain
+      shell: cmd
+      run: |
+        conda install -y --name cuda-env cuda-minimal-build
+
+    - name: Build (VS 2022)
+      shell: cmd
+      run: |
+        cd tests
+        mkdir build-vs
+        cd build-vs
+        "%ProgramFiles%\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" && "%ProgramFiles%\Microsoft Visual Studio\2022\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\cmake.exe" -G Ninja -DCMAKE_CUDA_COMPILER="%CONDA:\=/%/envs/cuda-env/Library/bin/nvcc.exe" .. && ninja
+
+    - name: Run (VS 2022)
+      shell: cmd
+      run: |
+        cd tests/build-vs
+        "%ProgramFiles%\Microsoft Visual Studio\2022\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\ctest.exe" --output-on-failure
+
+  test-windows-2022-b:
+    name: Windows Server 2022 (B)
+    runs-on: windows-2022
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Build (VS 2022 x86)
+      shell: cmd
+      run: |
+        cd tests
+        mkdir build-vs-x86
+        cd build-vs-x86
+        "%ProgramFiles%\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsamd64_x86.bat" && "%ProgramFiles%\Microsoft Visual Studio\2022\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\cmake.exe" -G Ninja -DENABLE_CUDA=False .. && ninja
+
+    - name: Build (VS 2022 Arm64)
+      shell: cmd
+      run: |
+        cd tests
+        mkdir build-vs-arm64
+        cd build-vs-arm64
+        "%ProgramFiles%\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsamd64_arm64.bat" && "%ProgramFiles%\Microsoft Visual Studio\2022\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\cmake.exe" -G Ninja -DENABLE_CUDA=False .. && ninja
+
+    - name: Build (Clang-CL)
+      shell: cmd
+      run: |
+        cd tests
+        mkdir build-clang-cl
+        cd build-clang-cl
+        "%ProgramFiles%\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" && "%ProgramFiles%\Microsoft Visual Studio\2022\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\cmake.exe" -G Ninja -DCMAKE_C_COMPILER="%ProgramFiles:\=/%/Microsoft Visual Studio/2022/Enterprise/VC/Tools/Llvm/x64/bin/clang-cl.exe" -DCMAKE_CXX_COMPILER="%ProgramFiles:\=/%/Microsoft Visual Studio/2022/Enterprise/VC/Tools/Llvm/x64/bin/clang-cl.exe" -DCMAKE_LINKER="%ProgramFiles:\=/%/Microsoft Visual Studio/2022/Enterprise/VC/Tools/Llvm/x64/bin/lld-link.exe" -DENABLE_CUDA=False .. && ninja
+
+    - name: Run (VS 2022 x86)
+      shell: cmd
+      run: |
+        cd tests/build-vs-x86
+        "%ProgramFiles%\Microsoft Visual Studio\2022\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\ctest.exe" --output-on-failure
+
+    - name: Run (Clang-CL)
+      shell: cmd
+      run: |
+        cd tests/build-clang-cl
+        "%ProgramFiles%\Microsoft Visual Studio\2022\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\ctest.exe" --output-on-failure
+
+  test-windows-2019-a:
+    name: Windows Server 2019 (A)
+    runs-on: windows-2019
+    steps:
+    - uses: actions/checkout@v4
+
+    - uses: conda-incubator/setup-miniconda@v3
+      with:
+        auto-update-conda: true
+        conda-remove-defaults: true
+
+    - name: Conda - Create Environment
+      shell: cmd
+      run: |
+        conda create -y --name cuda-env
+
+    - name: Conda - Install CUDA Toolchain
+      shell: cmd
+      run: |
+        conda install -y --name cuda-env cuda-minimal-build
+
+    - name: Build (VS 2019)
+      shell: cmd
+      run: |
+        cd tests
+        mkdir build-vs
+        cd build-vs
+        "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" && "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\cmake.exe" -G Ninja -DCMAKE_CUDA_COMPILER="%CONDA:\=/%/envs/cuda-env/Library/bin/nvcc.exe" .. && ninja
+
+    - name: Run (VS 2019)
+      shell: cmd
+      run: |
+        cd tests/build-vs
+        "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\ctest.exe" --output-on-failure
+
+  test-windows-2019-b:
+    name: Windows Server 2019 (B)
+    runs-on: windows-2019
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Build (VS 2019 x86)
+      shell: cmd
+      run: |
+        cd tests
+        mkdir build-vs-x86
+        cd build-vs-x86
+        "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsamd64_x86.bat" && "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\cmake.exe" -G Ninja -DENABLE_CUDA=False .. && ninja
+
+    - name: Build (VS 2019 Arm64)
+      shell: cmd
+      run: |
+        cd tests
+        mkdir build-vs-arm64
+        cd build-vs-arm64
+        "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsamd64_arm64.bat" && "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\cmake.exe" -G Ninja -DENABLE_CUDA=False .. && ninja
+
+    - name: Build (Clang-CL)
+      shell: cmd
+      run: |
+        cd tests
+        mkdir build-clang-cl
+        cd build-clang-cl
+        "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" && "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\cmake.exe" -G Ninja -DCMAKE_C_COMPILER="%ProgramFiles(x86):\=/%/Microsoft Visual Studio/2019/Enterprise/VC/Tools/Llvm/x64/bin/clang-cl.exe" -DCMAKE_CXX_COMPILER="%ProgramFiles(x86):\=/%/Microsoft Visual Studio/2019/Enterprise/VC/Tools/Llvm/x64/bin/clang-cl.exe" -DCMAKE_LINKER="%ProgramFiles(x86):\=/%/Microsoft Visual Studio/2019/Enterprise/VC/Tools/Llvm/x64/bin/lld-link.exe" -DENABLE_CUDA=False .. && ninja
+
+    - name: Run (VS 2019 x86)
+      shell: cmd
+      run: |
+        cd tests/build-vs-x86
+        "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\ctest.exe" --output-on-failure
+
+    - name: Run (Clang-CL)
+      shell: cmd
+      run: |
+        cd tests/build-clang-cl
+        "%ProgramFiles(x86)%\Microsoft Visual Studio\2019\Enterprise\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin\ctest.exe" --output-on-failure
+
+  test-macos-15:
+    name: macOS 15
+    runs-on: macos-15
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Install Toolchain
+      run: |
+        brew install --force-bottle ninja llvm lld mingw-w64
+
+    - name: Build (Apple Clang)
+      run: |
+        cd tests
+        mkdir build-appleclang
+        cd build-appleclang
+        cmake -G Ninja ..
+        ninja
+
+    - name: Build (GCC)
+      run: |
+        # Take the major version of the Homebrew gcc formula and build with gcc-<major>/g++-<major>.
+        mkdir tests/build-gcc
+        cd tests/build-gcc
+        GCC_VERSION="$(brew list --versions gcc | cut '-d ' -f2 | cut '-d.' -f1)"
+        cmake -G Ninja -DCMAKE_C_COMPILER="gcc-${GCC_VERSION:?brew reports no installed gcc formula}" -DCMAKE_CXX_COMPILER="g++-$GCC_VERSION" ..
+        ninja
+
+    - name: Build (Clang)
+      run: |
+        cd tests
+        mkdir build-clang
+        cd build-clang
+        export PATH="/opt/homebrew/opt/llvm/bin:$PATH"
+        export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+        export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+        cmake -G Ninja -DCMAKE_C_COMPILER="/opt/homebrew/opt/llvm/bin/clang" -DCMAKE_CXX_COMPILER="/opt/homebrew/opt/llvm/bin/clang++" -DCMAKE_LINKER="ld64.lld" ..
+        ninja
+
+    - name: Build (MinGW-w64 GCC)
+      run: |
+        cd tests
+        mkdir build-mingw
+        cd build-mingw
+        cmake -G Ninja -DCMAKE_TOOLCHAIN_FILE=../mingw-w64-x86_64.cmake -DENABLE_CUDA=False ..
+        ninja
+
+    - name: Run (Apple Clang)
+      run: |
+        cd tests/build-appleclang
+        ctest --output-on-failure
+
+    - name: Run (GCC)
+      run: |
+        cd tests/build-gcc
+        ctest --output-on-failure
+
+    - name: Run (Clang)
+      run: |
+        cd tests/build-clang
+        ctest --output-on-failure
+
+  test-macos-14:
+    name: macOS 14
+    runs-on: macos-14
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Install Toolchain
+      run: |
+        brew install --force-bottle ninja llvm lld mingw-w64
+
+    - name: Build (Apple Clang)
+      run: |
+        cd tests
+        mkdir build-appleclang
+        cd build-appleclang
+        cmake -G Ninja ..
+        ninja
+
+    - name: Build (GCC)
+      run: |
+        # Take the major version of the Homebrew gcc formula and build with gcc-<major>/g++-<major>.
+        mkdir tests/build-gcc
+        cd tests/build-gcc
+        GCC_VERSION="$(brew list --versions gcc | cut '-d ' -f2 | cut '-d.' -f1)"
+        cmake -G Ninja -DCMAKE_C_COMPILER="gcc-${GCC_VERSION:?brew reports no installed gcc formula}" -DCMAKE_CXX_COMPILER="g++-$GCC_VERSION" ..
+        ninja
+
+    - name: Build (Clang)
+      run: |
+        cd tests
+        mkdir build-clang
+        cd build-clang
+        export PATH="/opt/homebrew/opt/llvm/bin:$PATH"
+        export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+        export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+        cmake -G Ninja -DCMAKE_C_COMPILER="/opt/homebrew/opt/llvm/bin/clang" -DCMAKE_CXX_COMPILER="/opt/homebrew/opt/llvm/bin/clang++" -DCMAKE_LINKER="ld64.lld" ..
+        ninja
+
+    - name: Build (MinGW-w64 GCC)
+      run: |
+        cd tests
+        mkdir build-mingw
+        cd build-mingw
+        cmake -G Ninja -DCMAKE_TOOLCHAIN_FILE=../mingw-w64-x86_64.cmake -DENABLE_CUDA=False ..
+        ninja
+
+    - name: Run (Apple Clang)
+      run: |
+        cd tests/build-appleclang
+        ctest --output-on-failure
+
+    - name: Run (GCC)
+      run: |
+        cd tests/build-gcc
+        ctest --output-on-failure
+
+    - name: Run (Clang)
+      run: |
+        cd tests/build-clang
+        ctest --output-on-failure
+
+  test-macos-13:
+    name: macOS 13
+    runs-on: macos-13
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Install Toolchain
+      run: |
+        brew install --force-bottle ninja llvm lld mingw-w64
+
+    - name: Build (Apple Clang)
+      run: |
+        cd tests
+        mkdir build-appleclang
+        cd build-appleclang
+        cmake -G Ninja ..
+        ninja
+
+    - name: Build (GCC)
+      run: |
+        # Take the major version of the Homebrew gcc formula and build with gcc-<major>/g++-<major>.
+        mkdir tests/build-gcc
+        cd tests/build-gcc
+        GCC_VERSION="$(brew list --versions gcc | cut '-d ' -f2 | cut '-d.' -f1)"
+        cmake -G Ninja -DCMAKE_C_COMPILER="gcc-${GCC_VERSION:?brew reports no installed gcc formula}" -DCMAKE_CXX_COMPILER="g++-$GCC_VERSION" ..
+        ninja
+
+    - name: Build (Clang)
+      run: |
+        cd tests
+        mkdir build-clang
+        cd build-clang
+        export PATH="/usr/local/opt/llvm/bin:$PATH"
+        export LDFLAGS="-L/usr/local/opt/llvm/lib"
+        export CPPFLAGS="-I/usr/local/opt/llvm/include"
+        cmake -G Ninja -DCMAKE_C_COMPILER="/usr/local/opt/llvm/bin/clang" -DCMAKE_CXX_COMPILER="/usr/local/opt/llvm/bin/clang++" -DCMAKE_LINKER="ld64.lld" ..
+        ninja
+
+    - name: Build (MinGW-w64 GCC)
+      run: |
+        cd tests
+        mkdir build-mingw
+        cd build-mingw
+        cmake -G Ninja -DCMAKE_TOOLCHAIN_FILE=../mingw-w64-x86_64.cmake -DENABLE_CUDA=False ..
+        ninja
+
+    - name: Run (Apple Clang)
+      run: |
+        cd tests/build-appleclang
+        ctest --output-on-failure
+
+    - name: Run (GCC)
+      run: |
+        cd tests/build-gcc
+        ctest --output-on-failure
+
+    - name: Run (Clang)
+      run: |
+        cd tests/build-clang
+        ctest --output-on-failure
diff --git a/tests/.gitignore b/tests/.gitignore
new file mode 100644
index 0000000..0ec9e7f
--- /dev/null
+++ b/tests/.gitignore
@@ -0,0 +1 @@
+/build*
diff --git a/tests/Attributes.cpp b/tests/Attributes.cpp
new file mode 100644
index 0000000..651c8c0
--- /dev/null
+++ b/tests/Attributes.cpp
@@ -0,0 +1,224 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include <nvtx3/nvtx3.hpp>
+// Include again to catch bad guards
+#include <nvtx3/nvtx3.hpp>
+
+#include <iostream>
+#include <string>
+
+#include "PrettyPrintersNvtxCpp.h"
+
+struct a_lib
+{
+    static constexpr const char* name{"Library A"};
+};
+
+extern "C" NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+
+    (void)argc;
+    (void)argv;
+
+    {
+        std::cout << "Default attributes:\n";
+        nvtx3::event_attributes attr;
+        std::cout << attr;
+    }
+    std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a payload:\n";
+        nvtx3::event_attributes attr{nvtx3::payload{5.0f}};
+        std::cout << attr;
+    }
+    std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a color with RGB hex code 0xFF7F00:\n";
+        nvtx3::event_attributes attr{nvtx3::color{0xFFFF7F00}};
+        std::cout << attr;
+    }
+    std::cout << "-------------------------------------\n";
+
+
+    {
+        std::cout << "Set a color with RGB=255,127,0:\n";
+        nvtx3::event_attributes attr{nvtx3::rgb{255,127,0}};
+        std::cout << attr;
+    }
+    std::cout << "-------------------------------------\n";
+
+
+    {
+        std::cout << "Set a color & payload:\n";
+        nvtx3::event_attributes attr{nvtx3::rgb{255,127,0}, nvtx3::payload{5.0f}};
+        std::cout << attr;
+    }
+    std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a color (red), payload, color again (green)... first color wins:\n";
+
+        nvtx3::event_attributes attr{
+            nvtx3::rgb{255,0,0},
+            nvtx3::payload{5.0f},
+            nvtx3::rgb{0, 255, 0}};
+
+        std::cout << attr;
+    }
+    std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a message (ascii), payload, color, and category:\n";
+
+        nvtx3::event_attributes attr{
+            nvtx3::message{"Hello"},
+            nvtx3::category{11},
+            nvtx3::payload{5.0f},
+            nvtx3::rgb{0,255,0}};
+
+        std::cout << attr;
+    }
+    std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a message with different string types:\n";
+
+        nvtx3::event_attributes a{nvtx3::message{"Hello"}};
+        std::cout << a;
+
+        nvtx3::event_attributes wa{nvtx3::message{L"Hello"}};
+        std::cout << wa;
+
+        std::string hello{"Hello"};
+        nvtx3::event_attributes b{nvtx3::message{hello}};
+        std::cout << b;
+
+        std::wstring whello{L"Hello"};
+        nvtx3::event_attributes wb{nvtx3::message{whello}};
+        std::cout << wb;
+
+        // Important!  Neither of the following will compile:
+        //
+        //   nvtx3::event_attributes c{nvtx3::message{std::string{"foo"}}};
+        //   std::cout << c;
+        //
+        //   std::string foo{"foo"};
+        //   nvtx3::event_attributes d{nvtx3::message{foo + "bar"}};
+        //   std::cout << d;
+        //
+        // Both of those usages fail with:
+        // "error C2280: 'nvtx3::message::message(std::string &&)':
+        //  attempting to reference a deleted function"
+        //
+        // nvtx3::message is a "view" class, not an owning class.
+        // It cannot take ownership of a temporary string and
+        // destroy it when it goes out of scope.  Similarly,
+        // nvtx3::event_attributes is not an owning class, so it cannot take
+        // ownership of an nvtx3::message either.
+        //
+        // TODO:  Could we add implicit support for this?
+    }
+    std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a message (registered):\n";
+        auto hTacobell = reinterpret_cast<nvtxStringHandle_t>(0x7ac0be11);
+        nvtx3::event_attributes attr{nvtx3::message{hTacobell}};
+        std::cout << attr;
+    }
+    std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set category/message/payload/color, with \"using\":\n";
+
+        using namespace nvtx3;
+
+        event_attributes a{
+            category{11},
+            message{"Hello"},
+            payload{5.0f},
+            rgb{1,2,3}};
+
+        std::cout << a;
+    }
+    std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Convenience: Set a message without the helper type:\n";
+
+        nvtx3::event_attributes a{"Hello"};
+        std::cout << a;
+
+        std::string hello{"Hello"};
+        nvtx3::event_attributes b{hello};
+        std::cout << b;
+    }
+    std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Examples: \"using\", skip helper type for msg, set other fields:\n";
+
+        using namespace nvtx3;
+
+        event_attributes a{"Hello", payload{7.0}};
+        std::cout << a;
+
+        event_attributes b{"Hello", rgb{255,255,0}};
+        std::cout << b;
+
+        event_attributes c{"Hello", category{4}};
+        std::cout << c;
+
+        // Order doesn't matter
+        event_attributes d{"Hello", rgb{255,255,0}, payload{7.0}, category{4}};
+        std::cout << d;
+
+        event_attributes e{payload{7.0}, "Hello", category{4}, rgb{255,255,0}};
+        std::cout << e;
+
+        event_attributes f{category{4}, rgb{255,255,0}, payload{7.0}, "Hello"};
+        std::cout << f;
+
+        // Vertical formatting is nice too:
+        event_attributes g{
+            "Hello",
+            category{4},
+            rgb{255,255,0},
+            payload{7.0}};
+        std::cout << g;
+
+        event_attributes h
+        {
+            "Hello",
+            category{4},
+            rgb{255,255,0},
+            payload{7.0}
+        };
+        std::cout << h;
+    }
+    std::cout << "-------------------------------------\n";
+
+    return 0;
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..c246024
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,394 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+
+cmake_minimum_required(VERSION 3.19)
+
+# CUDA coverage defaults to on unless ENABLE_CUDA is already defined by the
+# caller.  On Apple hosts it is always forced off.
+if(APPLE)
+    set(ENABLE_CUDA False)
+elseif(NOT DEFINED ENABLE_CUDA)
+    set(ENABLE_CUDA True)
+endif()
+
+# Languages handed to project(): C and C++ always, CUDA only when enabled.
+set(NVTX_LANGUAGES C CXX)
+if(ENABLE_CUDA)
+    list(APPEND NVTX_LANGUAGES CUDA)
+endif()
+
+project(NvtxTests LANGUAGES ${NVTX_LANGUAGES})
+
+# Enforce standard C/C++ with sensible warnings and minimal compiler output on all platforms.
+# C is held to C90 and C++ to C++11, both with compiler extensions disabled.
+set(CMAKE_C_STANDARD 90)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_C_EXTENSIONS OFF)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+# Point NVCC at the same host compiler that builds the C++ sources.
+set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
+string(APPEND CMAKE_CUDA_FLAGS " -ccbin \"${CMAKE_CXX_COMPILER}\"")
+
+if(MSVC)
+    # Must use - instead of / for option prefix when using NVCC, because it forwards args
+    # it doesn't know to the host compiler, but being a UNIX-heritage program, it thinks
+    # an argument like "/nologo" is an input file path.  Luckily, MSVC accepts - prefixes.
+    #
+    # Enable options to behave closer to standard; -permissive- is only passed
+    # to compilers reporting version 19.14 or newer.
+    if(CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "19.14.0.0")
+        add_compile_options(-permissive-)
+    endif()
+    # The following line can be uncommented to test with WIN32_LEAN_AND_MEAN
+    #add_compile_definitions(WIN32_LEAN_AND_MEAN)
+endif()
+
+# Build with minimal or no dependencies on installed C/C++ runtime libraries
+if(NOT MSVC)
+    # Statically link libstdc++ and libgcc.  Do not statically link libc, though.
+    # Use an old sysroot if compatibility with old GLIBC versions is required.
+    # In non-DEBUG builds, use `-s` (or `-x -S` on Darwin) to strip unneeded symbols.
+    set(nvtx_is_gnu "$<CXX_COMPILER_ID:GNU>")
+    set(nvtx_is_optimized "$<CONFIG:Release,MinSizeRel>")
+    set(nvtx_is_darwin "$<PLATFORM_ID:Darwin>")
+    add_link_options(
+        "$<${nvtx_is_gnu}:-static-libstdc++>"
+        "$<${nvtx_is_gnu}:-static-libgcc>"
+        "$<$<AND:${nvtx_is_optimized},${nvtx_is_darwin}>:-Wl,-x,-S>"
+        "$<$<AND:${nvtx_is_optimized},$<NOT:${nvtx_is_darwin}>>:-Wl,-s>"
+    )
+else()
+    # For Non-debug, change /MD (MultiThreadedDLL) to /MT (MultiThreaded)
+    # For Debug, change /MDd (MultiThreadedDebugDLL) to /MTd (MultiThreadedDebug)
+    set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+endif()
+
+# Compiler-specific idiosyncrasies
+if(NOT MSVC)
+    # -Wfatal-errors:   stop compiling immediately after first error
+    # -Wmissing-braces: check for initializing unions without required braces
+    add_compile_options(-Wfatal-errors -Wmissing-braces)
+else()
+    # Must use - instead of / for option prefix when using NVCC, because it forwards args
+    # it doesn't know to the host compiler, but being a UNIX-heritage program, it thinks
+    # an argument like "/nologo" is an input file path.  Luckily, MSVC accepts - prefixes.
+    add_compile_options(-nologo)
+    #add_compile_options(-wd26812) # Disable warning: prefer enum class over unscoped enum
+    add_link_options(-NOLOGO -INCREMENTAL:NO)
+
+    # On some platforms, CMake doesn't automatically add C++ flags to enable RTTI (/GR) or
+    # configure C++ exceptions to the commonly preferred value (/EHsc or /GX).  Add these
+    # if they are missing.
+    if(NOT CMAKE_CXX_FLAGS MATCHES "[/-]GR( |$)")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -GR")
+    endif()
+    if(NOT CMAKE_CXX_FLAGS MATCHES "[/-](EHsc|GX)( |$)")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -EHsc")
+    endif()
+
+    # Improve debugging.
+    # NOTE(review): CMAKE_BUILD_TYPE is only set for single-config generators, so this
+    # branch is skipped under multi-config generators - confirm that is intended.
+    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+        # Setting CMAKE_VS_JUST_MY_CODE_DEBUGGING for some reason also adds "MDd" even
+        # though above we asked for MTd, so add the /JMC option manually instead.
+        #set(CMAKE_VS_JUST_MY_CODE_DEBUGGING ON)
+        add_compile_options(-JMC)
+    endif()
+endif()
+
+
+# Import the NVTX build from ../c (presumably the source of the nvtx3-c and
+# nvtx3-cpp targets linked below - confirm against ../c/CMakeLists.txt).
+add_subdirectory("../c" "ImportNvtx")
+
+#if(DOMAINS_ERROR_TEST_NAME_IS_MISSING)
+#    target_compile_definitions(domains PRIVATE ERROR_TEST_NAME_IS_MISSING)
+#endif()
+
+# Test driver executable invoked by the CTest entries in this file.
+add_executable(runtest "RunTest.cpp")
+target_link_libraries(runtest PRIVATE nvtx3-cpp)
+
+# Defines one printing-injection library.
+#   name      target name, also compiled in as the INJECTION_PRINT_PREFIX string
+#   kind      SHARED or STATIC (STATIC targets get -fPIC on non-MSVC compilers)
+#   source    the single source file to build
+#   nvtx_lib  NVTX target to link (nvtx3-c or nvtx3-cpp)
+#   ARGN      SUPPORT_* compile definitions selecting the injection mechanisms
+function(nvtx_add_print_injection name kind source nvtx_lib)
+    add_library(${name} ${kind} "${source}")
+    if("${kind}" STREQUAL "STATIC" AND NOT MSVC)
+        target_compile_options(${name} PRIVATE -fPIC)
+    endif()
+    target_compile_definitions(${name} PRIVATE ${ARGN} "INJECTION_PRINT_PREFIX=\"${name}\"")
+    target_link_libraries(${name} PRIVATE ${nvtx_lib})
+    set_target_properties(${name} PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+endfunction()
+
+# All mechanisms enabled at once
+nvtx_add_print_injection(inj-c   SHARED "PrintInjectionC.c"     nvtx3-c
+    SUPPORT_STATIC_INJECTION SUPPORT_DYNAMIC_INJECTION SUPPORT_PREINJECTION)
+nvtx_add_print_injection(inj-cpp SHARED "PrintInjectionCpp.cpp" nvtx3-cpp
+    SUPPORT_STATIC_INJECTION SUPPORT_DYNAMIC_INJECTION SUPPORT_PREINJECTION)
+
+# Static injection only
+nvtx_add_print_injection(inj-c-static   STATIC "PrintInjectionC.c"     nvtx3-c   SUPPORT_STATIC_INJECTION)
+nvtx_add_print_injection(inj-cpp-static STATIC "PrintInjectionCpp.cpp" nvtx3-cpp SUPPORT_STATIC_INJECTION)
+
+# Dynamic injection only
+nvtx_add_print_injection(inj-c-dynamic   SHARED "PrintInjectionC.c"     nvtx3-c   SUPPORT_DYNAMIC_INJECTION)
+nvtx_add_print_injection(inj-cpp-dynamic SHARED "PrintInjectionCpp.cpp" nvtx3-cpp SUPPORT_DYNAMIC_INJECTION)
+
+# Pre-injection only
+nvtx_add_print_injection(inj-c-preinject   SHARED "PrintInjectionC.c"     nvtx3-c   SUPPORT_PREINJECTION)
+nvtx_add_print_injection(inj-cpp-preinject SHARED "PrintInjectionCpp.cpp" nvtx3-cpp SUPPORT_PREINJECTION)
+
+# Test libraries loaded by runtest: "self" and "calls" both bundle
+# SelfInjection.cpp, while "coverage-c" exercises the C API.
+add_library(self       SHARED "TestSelfInjection.cpp" "SelfInjection.cpp")
+add_library(calls      SHARED "Calls.cpp" "SelfInjection.cpp")
+add_library(coverage-c SHARED "CoverageC.c")
+
+target_link_libraries(self       PRIVATE nvtx3-cpp)
+target_link_libraries(calls      PRIVATE nvtx3-cpp)
+target_link_libraries(coverage-c PRIVATE nvtx3-c)
+
+# Hide every symbol that is not explicitly exported.
+set_target_properties(self calls coverage-c
+    PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+
+# Static-injection variants of the C coverage test (non-MSVC toolchains only).
+if(NOT MSVC)
+    # static-obj: the injection source is compiled directly into the test library.
+    add_library(coverage-c-static-obj-c SHARED "CoverageC.c" "PrintInjectionC.c")
+    target_compile_definitions(coverage-c-static-obj-c PRIVATE SUPPORT_STATIC_INJECTION INJECTION_PRINT_PREFIX="inj-c-static")
+    target_link_libraries(coverage-c-static-obj-c PRIVATE nvtx3-c)
+    set_target_properties(coverage-c-static-obj-c PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+
+    add_library(coverage-c-static-obj-cpp SHARED "CoverageC.c" "PrintInjectionCpp.cpp")
+    target_compile_definitions(coverage-c-static-obj-cpp PRIVATE SUPPORT_STATIC_INJECTION INJECTION_PRINT_PREFIX="inj-cpp-static")
+    target_link_libraries(coverage-c-static-obj-cpp PRIVATE nvtx3-c)
+    set_target_properties(coverage-c-static-obj-cpp PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+
+    # static-lib: the injection comes from a static library.  The archive is linked
+    # only inside the --whole-archive/--no-whole-archive bracket, matching the
+    # coverage-cpp-static-lib-* targets, so it appears once on the link line.
+    # Naming the target inside the bracket still gives CMake the build dependency.
+    # (Skipped on Apple - presumably because its linker lacks --whole-archive; confirm.)
+    if(NOT APPLE)
+        add_library(coverage-c-static-lib-c SHARED "CoverageC.c")
+        target_link_libraries(coverage-c-static-lib-c PRIVATE nvtx3-c)
+        target_link_libraries(coverage-c-static-lib-c PRIVATE -Wl,--whole-archive inj-c-static -Wl,--no-whole-archive)
+        set_target_properties(coverage-c-static-lib-c PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+
+        add_library(coverage-c-static-lib-cpp SHARED "CoverageC.c")
+        target_link_libraries(coverage-c-static-lib-cpp PRIVATE nvtx3-c)
+        target_link_libraries(coverage-c-static-lib-cpp PRIVATE -Wl,--whole-archive inj-cpp-static -Wl,--no-whole-archive)
+        set_target_properties(coverage-c-static-lib-cpp PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+    endif()
+endif()
+
+# C coverage test built with NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY=1, plus the
+# plain C++ coverage test.
+add_library(coverage-c-preinject SHARED "CoverageC.c")
+add_library(coverage-cpp         SHARED "CoverageCpp.cpp")
+
+target_compile_definitions(coverage-c-preinject PRIVATE NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY=1)
+
+target_link_libraries(coverage-c-preinject PRIVATE nvtx3-c)
+target_link_libraries(coverage-cpp         PRIVATE nvtx3-cpp)
+
+set_target_properties(coverage-c-preinject coverage-cpp
+    PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+
+# Static-injection variants of the C++ coverage test (non-MSVC toolchains only).
+if(NOT MSVC)
+    set(nvtx_print_injection_source_c   "PrintInjectionC.c")
+    set(nvtx_print_injection_source_cpp "PrintInjectionCpp.cpp")
+
+    # static-obj: the injection source is compiled directly into the test library.
+    foreach(inj IN ITEMS c cpp)
+        set(nvtx_tgt coverage-cpp-static-obj-${inj})
+        add_library(${nvtx_tgt} SHARED "CoverageCpp.cpp" "${nvtx_print_injection_source_${inj}}")
+        target_compile_definitions(${nvtx_tgt} PRIVATE
+            SUPPORT_STATIC_INJECTION
+            "INJECTION_PRINT_PREFIX=\"inj-${inj}-static\"")
+        target_link_libraries(${nvtx_tgt} PRIVATE nvtx3-cpp)
+        set_target_properties(${nvtx_tgt} PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+    endforeach()
+
+    # static-lib: the injection static library is linked in whole between
+    # --whole-archive and --no-whole-archive.  Not built on Apple.
+    if(NOT APPLE)
+        foreach(inj IN ITEMS c cpp)
+            set(nvtx_tgt coverage-cpp-static-lib-${inj})
+            add_library(${nvtx_tgt} SHARED "CoverageCpp.cpp")
+            target_link_libraries(${nvtx_tgt} PRIVATE nvtx3-cpp)
+            target_link_libraries(${nvtx_tgt} PRIVATE -Wl,--whole-archive inj-${inj}-static -Wl,--no-whole-archive)
+            set_target_properties(${nvtx_tgt} PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+        endforeach()
+    endif()
+endif()
+
+# C++ coverage test built with NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY=1.
+add_library(coverage-cpp-preinject SHARED "CoverageCpp.cpp")
+target_compile_definitions(coverage-cpp-preinject PRIVATE NVTX_SUPPORT_ALREADY_INJECTED_LIBRARY=1)
+target_link_libraries(coverage-cpp-preinject PRIVATE nvtx3-cpp)
+
+# Extension coverage tests written in C.
+add_library(coverage-ext-mem     SHARED "CoverageExtMem.c")
+add_library(coverage-ext-payload SHARED "CoverageExtPayload.c")
+add_library(coverage-ext-counter SHARED "CoverageExtCounter.c")
+set(nvtx_coverage_c_api_targets coverage-ext-mem coverage-ext-payload coverage-ext-counter)
+set(nvtx_coverage_all_targets coverage-cpp-preinject ${nvtx_coverage_c_api_targets})
+
+# CUDA sources are only built when CUDA is enabled.
+if(ENABLE_CUDA)
+    add_library(coverage-cu SHARED "CoverageCuda.cu")
+    target_link_libraries(coverage-cu PRIVATE nvtx3-cpp)
+
+    add_library(coverage-ext-memcudart SHARED "CoverageExtMemCudaRt.cu")
+    list(APPEND nvtx_coverage_c_api_targets coverage-ext-memcudart)
+
+    list(APPEND nvtx_coverage_all_targets coverage-cu coverage-ext-memcudart)
+endif()
+
+foreach(nvtx_tgt IN LISTS nvtx_coverage_c_api_targets)
+    target_link_libraries(${nvtx_tgt} PRIVATE nvtx3-c)
+endforeach()
+
+set_target_properties(${nvtx_coverage_all_targets}
+    PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+
+# C++ API feature tests: one shared library per test, all linked against
+# nvtx3-cpp with hidden default symbol visibility.
+add_library(attributes  SHARED "Attributes.cpp")
+add_library(domains     SHARED "Domains.cpp")
+add_library(categories  SHARED "NamedCategories.cpp")
+add_library(regstrings  SHARED "RegisteredStrings.cpp")
+add_library(linkerdupes SHARED
+    "LinkerDupesMain.cpp"
+    "LinkerDupesFileA.cpp"
+    "LinkerDupesFileB.cpp")
+
+set(nvtx_cpp_feature_targets attributes domains categories regstrings linkerdupes)
+foreach(nvtx_tgt IN LISTS nvtx_cpp_feature_targets)
+    target_link_libraries(${nvtx_tgt} PRIVATE nvtx3-cpp)
+endforeach()
+set_target_properties(${nvtx_cpp_feature_targets}
+    PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+
+# Exported-API pair: export-api is built from C, use-exported-api from C++.
+# Both compile against the trimmed headers under Imports/.
+add_library(export-api       SHARED "ExportApi.c")
+add_library(use-exported-api SHARED "UseExportedApi.cpp")
+
+target_link_libraries(export-api       PRIVATE nvtx3-c)
+target_link_libraries(use-exported-api PRIVATE nvtx3-cpp)
+
+foreach(nvtx_tgt IN ITEMS export-api use-exported-api)
+    target_include_directories(${nvtx_tgt} PRIVATE "Imports/cuda_lite" "Imports/opencl_lite")
+endforeach()
+
+set_target_properties(export-api use-exported-api
+    PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+
+# Only for non-MSVC compilers targeting Windows: pass --kill-at to the linker
+# when linking export-api.
+target_link_options(export-api PRIVATE
+    "$<$<AND:$<PLATFORM_ID:Windows>,$<NOT:$<CXX_COMPILER_ID:MSVC>>>:-Wl,--kill-at>"
+)
+
+enable_testing()
+
+# Registers "<name> with PrintInjectionC" and "<name> with PrintInjectionCpp",
+# which run test library <target> with inj-c / inj-cpp passed via -i.
+function(nvtx_add_print_injection_tests name target)
+    add_test(NAME "${name} with PrintInjectionC"   COMMAND runtest -t ${target} -i inj-c)
+    add_test(NAME "${name} with PrintInjectionCpp" COMMAND runtest -t ${target} -i inj-cpp)
+endfunction()
+
+# Registers "<name>" (no -i argument) followed by the two PrintInjection variants.
+function(nvtx_add_test_trio name target)
+    add_test(NAME "${name}" COMMAND runtest -t ${target})
+    nvtx_add_print_injection_tests("${name}" ${target})
+endfunction()
+
+# "self" and "calls" additionally run with themselves passed as the injection.
+add_test(NAME "Self" COMMAND runtest -t self)
+add_test(NAME "Self with SelfInjection" COMMAND runtest -t self -i self)
+nvtx_add_print_injection_tests("Self" self)
+
+add_test(NAME "Calls" COMMAND runtest -t calls)
+add_test(NAME "Calls with CallsInjection" COMMAND runtest -t calls -i calls)
+nvtx_add_print_injection_tests("Calls" calls)
+
+# The C and C++ coverage tests share the same matrix of injection mechanisms.
+foreach(nvtx_api IN ITEMS C Cpp)
+    string(TOLOWER "${nvtx_api}" nvtx_api_lower)
+    set(nvtx_cov "Coverage${nvtx_api}")
+    set(nvtx_cov_tgt "coverage-${nvtx_api_lower}")
+
+    nvtx_add_test_trio("${nvtx_cov}" ${nvtx_cov_tgt})
+    if(NOT MSVC)
+        add_test(NAME "${nvtx_cov} with PrintInjectionC as StaticObjectC" COMMAND runtest -t ${nvtx_cov_tgt}-static-obj-c)
+        add_test(NAME "${nvtx_cov} with PrintInjectionCpp as StaticObjectCpp" COMMAND runtest -t ${nvtx_cov_tgt}-static-obj-cpp)
+        if(NOT APPLE)
+            add_test(NAME "${nvtx_cov} with PrintInjectionC as StaticLibraryC" COMMAND runtest -t ${nvtx_cov_tgt}-static-lib-c)
+            add_test(NAME "${nvtx_cov} with PrintInjectionCpp as StaticLibraryCpp" COMMAND runtest -t ${nvtx_cov_tgt}-static-lib-cpp)
+        endif()
+    endif()
+    add_test(NAME "${nvtx_cov} with PrintInjectionC as Dynamic" COMMAND runtest -t ${nvtx_cov_tgt} -i inj-c-dynamic)
+    add_test(NAME "${nvtx_cov} with PrintInjectionCpp as Dynamic" COMMAND runtest -t ${nvtx_cov_tgt} -i inj-cpp-dynamic)
+    # Both preinjection tests run the same command; they differ only in the
+    # ENVIRONMENT test property assigned to them elsewhere in this file.
+    add_test(NAME "${nvtx_cov} with PrintInjectionC as Preinjection" COMMAND runtest -t ${nvtx_cov_tgt}-preinject)
+    add_test(NAME "${nvtx_cov} with PrintInjectionCpp as Preinjection" COMMAND runtest -t ${nvtx_cov_tgt}-preinject)
+endforeach()
+
+if(ENABLE_CUDA)
+    nvtx_add_test_trio("CoverageCuda" coverage-cu)
+endif()
+nvtx_add_test_trio("CoverageExtMem" coverage-ext-mem)
+if(ENABLE_CUDA)
+    nvtx_add_test_trio("CoverageExtMemCudaRt" coverage-ext-memcudart)
+endif()
+nvtx_add_test_trio("CoverageExtPayload" coverage-ext-payload)
+nvtx_add_test_trio("CoverageExtCounter" coverage-ext-counter)
+nvtx_add_test_trio("Attributes"         attributes)
+nvtx_add_test_trio("Domains"            domains)
+nvtx_add_test_trio("NamedCategories"    categories)
+nvtx_add_test_trio("RegisteredStrings"  regstrings)
+nvtx_add_test_trio("LinkerDupes"        linkerdupes)
+nvtx_add_test_trio("UseExportedApi"     use-exported-api)
+
+
+# The "as Preinjection" tests need the matching printing-injection library
+# named in the loader's preload variable of the runtest process:
+# DYLD_INSERT_LIBRARIES on Apple, LD_PRELOAD on other non-Windows platforms.
+# Nothing is set on Windows.
+#
+# The library path is resolved with $<TARGET_FILE:...> instead of being
+# assembled from CMAKE_BINARY_DIR, so it stays correct with multi-config
+# generators and custom output directories.  Generator expressions are
+# supported here because the tests were created with add_test(NAME ...).
+if(APPLE)
+    set(NVTX_TEST_PRELOAD_VAR DYLD_INSERT_LIBRARIES)
+elseif(NOT WIN32)
+    set(NVTX_TEST_PRELOAD_VAR LD_PRELOAD)
+else()
+    set(NVTX_TEST_PRELOAD_VAR "")
+endif()
+
+if(NVTX_TEST_PRELOAD_VAR)
+    foreach(nvtx_coverage IN ITEMS "CoverageC" "CoverageCpp")
+        foreach(nvtx_injection IN ITEMS "C" "Cpp")
+            # PrintInjectionC -> inj-c-preinject, PrintInjectionCpp -> inj-cpp-preinject
+            string(TOLOWER "${nvtx_injection}" nvtx_injection_lower)
+            set_property(TEST "${nvtx_coverage} with PrintInjection${nvtx_injection} as Preinjection"
+                PROPERTY
+                ENVIRONMENT "${NVTX_TEST_PRELOAD_VAR}=$<TARGET_FILE:inj-${nvtx_injection_lower}-preinject>")
+        endforeach()
+    endforeach()
+endif()
diff --git a/tests/Calls.cpp b/tests/Calls.cpp
new file mode 100644
index 0000000..7022d22
--- /dev/null
+++ b/tests/Calls.cpp
@@ -0,0 +1,433 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include <nvtx3/nvtx3.hpp>
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "SelfInjection.h"
+#include "PrettyPrintersNvtxC.h"
+
+// Scoped test helper.  While an instance is alive, every call delivered to
+// g_callbacks.Default is appended to an internal list; the callback set that
+// was installed before construction is restored by the destructor.
+class CallbackTester
+{
+    Callbacks stored;         // Copy of g_callbacks taken at construction
+    std::vector<Call> calls;  // Calls recorded so far, in arrival order
+
+public:
+    // Appends one observed call to the recorded sequence.
+    void Record(Call const& call) { calls.push_back(call); }
+
+    CallbackTester() : stored(g_callbacks)
+    {
+        // The lambda captures `this`; the destructor reinstates the saved
+        // callbacks before the object goes away.
+        g_callbacks.Default = [&](Call const& call) { Record(call); };
+    }
+    ~CallbackTester() { g_callbacks = stored; }
+
+    // Compares the recorded calls, element by element, with the leading
+    // elements of expCalls using Same().
+    //
+    // Returns true when every recorded call matches its expected counterpart.
+    // A recorded sequence shorter than expCalls (including an empty one)
+    // still counts as a match - presumably so the test also passes when this
+    // library's own injection is not the one loaded; confirm with callers.
+    // Returns false when more calls were recorded than expected.
+    //
+    // With verbose set, a mismatch prints both sequences to std::cout.
+    bool CallsMatch(std::vector<Call> expCalls, bool verbose = false) const
+    {
+        auto cmp = [&](Call const& lhs, Call const& rhs)
+        {
+            return Same(lhs, rhs, true, verbose, "NVTX call");
+        };
+
+        // The three-iterator std::equal assumes the second range is at least
+        // as long as the first, so reject the longer-than-expected case up
+        // front instead of reading past the end of expCalls.
+        bool match = calls.size() <= expCalls.size()
+            && std::equal(calls.begin(), calls.end(), expCalls.begin(), cmp);
+        if (verbose && !match)
+        {
+            auto printCall = [](Call const& c) { std::cout << "    " << *c << "\n"; };
+            std::cout << "Did not get expected NVTX C API call sequence!  Expected:\n";
+            for (auto& c : expCalls) printCall(c);
+            std::cout << "Recorded:\n";
+            for (auto& c : calls) printCall(c);
+        }
+
+        return match;
+    }
+};
+
+// Tag types used as template arguments for the nvtx3 calls in RunTest.
+// The int parameter N is otherwise unused: each test scope passes its own
+// value of N, which gives that scope its own distinct set of tag types.
+
+// Domain tags: `name` is the domain name.
+template <int N> struct a_lib
+{
+    static constexpr char const* name = "LibA";
+};
+template <int N> struct b_lib
+{
+    static constexpr char const* name = "LibB";
+};
+template <int N> struct c_lib
+{
+    static constexpr char const* name = "LibC";
+};
+
+// Named-category tags: `name` is the category name, `id` its numeric id.
+template <int N> struct cat1
+{
+    static constexpr char const* name = "Cat1";
+    static constexpr uint32_t id = 1;
+};
+template <int N> struct cat2
+{
+    static constexpr char const* name = "Cat2";
+    static constexpr uint32_t id = 2;
+};
+template <int N> struct cat3
+{
+    static constexpr char const* name = "Cat3";
+    static constexpr uint32_t id = 3;
+};
+
+// Registered-string tags: `message` is the string to register.
+template <int N> struct reg1
+{
+    static constexpr char const* message = "Reg1";
+};
+template <int N> struct reg2
+{
+    static constexpr char const* message = "Reg2";
+};
+template <int N> struct reg3
+{
+    static constexpr char const* message = "Reg3";
+};
+
+extern "C" NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+
+    bool verbose = false;
+    const std::string verboseArg = "-v";
+    for (; *argv; ++argv)
+    {
+        if (*argv == verboseArg) verbose = true;
+    }
+
+    using namespace nvtx3;
+
+    //---------------------------- Tests --------------------------------------
+
+    if (verbose) std::cout << "--------- Testing injection loader\n";
+
+    {
+        CallbackTester t;
+
+        nvtxInitialize(nullptr);
+        nvtxInitialize(nullptr);
+
+        if (!t.CallsMatch({
+            CALL_LOAD(1),
+            CALL(CORE2, Initialize, nullptr),
+            CALL(CORE2, Initialize, nullptr)
+        }, verbose)) return 1;
+    }
+
+    if (verbose) std::cout << "--------- Testing C API\n";
+
+    {
+        CallbackTester t;
+
+        const char* teststr = "Testing 1 2 3!";
+        nvtxMarkA(teststr);
+
+        if (!t.CallsMatch({
+            CALL(CORE, MarkA, teststr)
+        }, verbose)) return 1;
+    }
+
+    {
+        CallbackTester t;
+
+        char teststr[] = "Testing 1 2 3!";
+        nvtxMarkA(teststr);
+        memcpy(teststr,  "Overwritten!!!", sizeof(teststr));
+
+        if (!t.CallsMatch({
+            CALL(CORE, MarkA, "Testing 1 2 3!")
+        }, verbose)) return 1;
+    }
+
+    {
+        CallbackTester t;
+
+        wchar_t teststr[] = L"Testing 1 2 3!";
+        nvtxMarkW(teststr);
+        memcpy(teststr,     L"Overwritten!!!", sizeof(teststr));
+
+        if (!t.CallsMatch({
+            CALL(CORE, MarkW, L"Testing 1 2 3!")
+        }, verbose)) return 1;
+    }
+
+    {
+        CallbackTester t;
+
+        nvtxEventAttributes_t attr{NVTX_VERSION, sizeof(nvtxEventAttributes_t)};
+        attr.category = 123;
+        attr.colorType = NVTX_COLOR_ARGB;
+        attr.color = 0xFF4466BB;
+        attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
+        attr.message = MakeMessage("Test MarkEX");
+        attr.category = 123;
+        attr.payloadType = NVTX_PAYLOAD_TYPE_DOUBLE;
+        attr.payload = MakePayload(3.14159);
+        nvtxMarkEx(&attr);
+
+        nvtxEventAttributes_t attr2 = attr;
+        memset(&attr, 0, sizeof(attr));
+
+        if (!t.CallsMatch({
+            CALL(CORE, MarkEx, &attr2)
+        }, verbose)) return 1;
+    }
+
+    if (verbose) std::cout << "--------- Testing C++ API\n";
+
+    {
+        CallbackTester t;
+
+        mark("Testing 1 2 3!");
+        mark(L"Testing 1 2 3!");
+
+        if (!t.CallsMatch({
+            CALL(CORE2, DomainMarkEx, nullptr, event_attributes{"Testing 1 2 3!"}.get()),
+            CALL(CORE2, DomainMarkEx, nullptr, event_attributes{L"Testing 1 2 3!"}.get())
+        }, verbose)) return 1;
+    }
+
+    {
+        CallbackTester t;
+
+        nvtxEventAttributes_t attrExpected{NVTX_VERSION, sizeof(nvtxEventAttributes_t),
+            123, // category
+            NVTX_COLOR_ARGB, 0xFF4466BB,
+            NVTX_PAYLOAD_TYPE_DOUBLE, 0, MakePayload(3.14159),
+            NVTX_MESSAGE_TYPE_ASCII, MakeMessage("Test msg")
+        };
+
+        // Same args, different order
+        mark("Test msg", rgb(0x44, 0x66, 0xBB), category(123), payload(3.14159));
+        mark(payload(3.14159), "Test msg", rgb(0x44, 0x66, 0xBB), category(123));
+        mark(category(123), payload(3.14159), "Test msg", rgb(0x44, 0x66, 0xBB));
+        mark(rgb(0x44, 0x66, 0xBB), category(123), payload(3.14159), "Test msg");
+
+        // Same args with duplicates, test first-one-wins behavior (including union type changes)
+        mark("Test msg", rgb(0x44, 0x66, 0xBB), category(123), payload(3.14159),
+            "Bad msg", rgb(0x10, 0x20, 0x30), category(321), payload(3.0));
+        mark("Test msg", rgb(0x44, 0x66, 0xBB), category(123), payload(3.14159),
+            L"Bad message");
+        mark("Test msg", rgb(0x44, 0x66, 0xBB), category(123), payload(3.14159),
+            payload(3.14159f));
+
+        if (!t.CallsMatch({
+            7, CALL(CORE2, DomainMarkEx, nullptr, &attrExpected)
+        }, verbose)) return 1;
+    }
+
+    {
+        CallbackTester t;
+        constexpr int N = 1;
+        auto hA = (nvtxDomainHandle_t)1;
+
+        mark_in<a_lib<N>>("First call");
+        mark_in<a_lib<N>>("Second call");
+        mark_in<a_lib<N>>("Third call");
+
+        if (!t.CallsMatch({
+            CALL(CORE2, DomainCreateA, "LibA"),
+            CALL(CORE2, DomainMarkEx,  hA, event_attributes{"First call"}.get()),
+            CALL(CORE2, DomainMarkEx,  hA, event_attributes{"Second call"}.get()),
+            CALL(CORE2, DomainMarkEx,  hA, event_attributes{"Third call"}.get())
+        }, verbose)) return 1;
+    }
+
+    {
+        CallbackTester t;
+        constexpr int N = 2;
+        auto hA = (nvtxDomainHandle_t)1;
+        auto hB = (nvtxDomainHandle_t)2;
+
+        mark_in<a_lib<N>>("First call");
+        mark_in<a_lib<N>>("Second call");
+        mark_in<b_lib<N>>("First call");
+        mark_in<b_lib<N>>("Second call");
+
+        if (!t.CallsMatch({
+            CALL(CORE2, DomainCreateA, "LibA"),
+            CALL(CORE2, DomainMarkEx,  hA, event_attributes{"First call"}.get()),
+            CALL(CORE2, DomainMarkEx,  hA, event_attributes{"Second call"}.get()),
+            CALL(CORE2, DomainCreateA, "LibB"),
+            CALL(CORE2, DomainMarkEx,  hB, event_attributes{"First call"}.get()),
+            CALL(CORE2, DomainMarkEx,  hB, event_attributes{"Second call"}.get())
+        }, verbose)) return 1;
+    }
+
+    {
+        CallbackTester t;
+        constexpr int N = 3;
+        auto hA = (nvtxDomainHandle_t)1;
+        auto hB = (nvtxDomainHandle_t)2;
+
+        mark_in<a_lib<N>>("DA, Cat 1, call 1", named_category_in<a_lib<N>>::get<cat1<N>>());
+        mark_in<a_lib<N>>("DA, Cat 1, call 2", named_category_in<a_lib<N>>::get<cat1<N>>());
+        mark_in<a_lib<N>>("DA, Cat 2, call 1", named_category_in<a_lib<N>>::get<cat2<N>>());
+        mark_in<a_lib<N>>("DA, Cat 2, call 2", named_category_in<a_lib<N>>::get<cat2<N>>());
+        mark_in<b_lib<N>>("DB, Cat 1, call 1", named_category_in<b_lib<N>>::get<cat1<N>>());
+        mark_in<b_lib<N>>("DB, Cat 1, call 2", named_category_in<b_lib<N>>::get<cat1<N>>());
+        mark_in<b_lib<N>>("DB, Cat 2, call 1", named_category_in<b_lib<N>>::get<cat2<N>>());
+        mark_in<b_lib<N>>("DB, Cat 2, call 2", named_category_in<b_lib<N>>::get<cat2<N>>());
+
+        if (!t.CallsMatch({
+            CALL(CORE2, DomainCreateA,       "LibA"),
+            CALL(CORE2, DomainNameCategoryA, hA, 1, "Cat1"),
+            CALL(CORE2, DomainMarkEx,        hA, event_attributes{"DA, Cat 1, call 1", category(1)}.get()),
+            CALL(CORE2, DomainMarkEx,        hA, event_attributes{"DA, Cat 1, call 2", category(1)}.get()),
+            CALL(CORE2, DomainNameCategoryA, hA, 2, "Cat2"),
+            CALL(CORE2, DomainMarkEx,        hA, event_attributes{"DA, Cat 2, call 1", category(2)}.get()),
+            CALL(CORE2, DomainMarkEx,        hA, event_attributes{"DA, Cat 2, call 2", category(2)}.get()),
+            CALL(CORE2, DomainCreateA,       "LibB"),
+            CALL(CORE2, DomainNameCategoryA, hB, 1, "Cat1"),
+            CALL(CORE2, DomainMarkEx,        hB, event_attributes{"DB, Cat 1, call 1", category(1)}.get()),
+            CALL(CORE2, DomainMarkEx,        hB, event_attributes{"DB, Cat 1, call 2", category(1)}.get()),
+            CALL(CORE2, DomainNameCategoryA, hB, 2, "Cat2"),
+            CALL(CORE2, DomainMarkEx,        hB, event_attributes{"DB, Cat 2, call 1", category(2)}.get()),
+            CALL(CORE2, DomainMarkEx,        hB, event_attributes{"DB, Cat 2, call 2", category(2)}.get()),
+        }, verbose)) return 1;
+    }
+
+    {
+        CallbackTester t;
+        constexpr int N = 4;
+        auto hA = (nvtxDomainHandle_t)1;
+        auto hB = (nvtxDomainHandle_t)2;
+        auto hReg1 = (nvtxStringHandle_t)1;
+        auto hReg2 = (nvtxStringHandle_t)2;
+
+        mark_in<a_lib<N>>(registered_string_in<a_lib<N>>::get<reg1<N>>());
+        mark_in<a_lib<N>>(registered_string_in<a_lib<N>>::get<reg1<N>>());
+        mark_in<a_lib<N>>(registered_string_in<a_lib<N>>::get<reg2<N>>());
+        mark_in<a_lib<N>>(registered_string_in<a_lib<N>>::get<reg2<N>>());
+        mark_in<b_lib<N>>(registered_string_in<b_lib<N>>::get<reg1<N>>());
+        mark_in<b_lib<N>>(registered_string_in<b_lib<N>>::get<reg1<N>>());
+        mark_in<b_lib<N>>(registered_string_in<b_lib<N>>::get<reg2<N>>());
+        mark_in<b_lib<N>>(registered_string_in<b_lib<N>>::get<reg2<N>>());
+
+        if (!t.CallsMatch({
+            CALL(CORE2, DomainCreateA,         "LibA"),
+            CALL(CORE2, DomainRegisterStringA, hA, "Reg1"),
+            CALL(CORE2, DomainMarkEx,          hA, event_attributes{hReg1}.get()),
+            CALL(CORE2, DomainMarkEx,          hA, event_attributes{hReg1}.get()),
+            CALL(CORE2, DomainRegisterStringA, hA, "Reg2"),
+            CALL(CORE2, DomainMarkEx,          hA, event_attributes{hReg2}.get()),
+            CALL(CORE2, DomainMarkEx,          hA, event_attributes{hReg2}.get()),
+            CALL(CORE2, DomainCreateA,         "LibB"),
+            CALL(CORE2, DomainRegisterStringA, hB, "Reg1"),
+            CALL(CORE2, DomainMarkEx,          hB, event_attributes{hReg1}.get()),
+            CALL(CORE2, DomainMarkEx,          hB, event_attributes{hReg1}.get()),
+            CALL(CORE2, DomainRegisterStringA, hB, "Reg2"),
+            CALL(CORE2, DomainMarkEx,          hB, event_attributes{hReg2}.get()),
+            CALL(CORE2, DomainMarkEx,          hB, event_attributes{hReg2}.get()),
+        }, verbose)) return 1;
+    }
+
+    {
+        CallbackTester t;
+        constexpr int N = 5;
+        auto hA = (nvtxDomainHandle_t)1;
+        auto hB = (nvtxDomainHandle_t)2;
+        auto hReg1 = (nvtxStringHandle_t)1;
+        auto hReg2 = (nvtxStringHandle_t)2;
+
+        auto& a_regstr1 = registered_string_in<a_lib<N>>::get<reg1<N>>();
+        auto& a_regstr2 = registered_string_in<a_lib<N>>::get<reg2<N>>();
+        auto& b_regstr1 = registered_string_in<b_lib<N>>::get<reg1<N>>();
+        auto& b_regstr2 = registered_string_in<b_lib<N>>::get<reg2<N>>();
+
+        auto& a_cat1 = named_category_in<a_lib<N>>::get<cat1<N>>();
+        auto& a_cat2 = named_category_in<a_lib<N>>::get<cat2<N>>();
+        auto& b_cat1 = named_category_in<b_lib<N>>::get<cat1<N>>();
+        auto& b_cat2 = named_category_in<b_lib<N>>::get<cat2<N>>();
+
+        mark_in<a_lib<N>>(a_cat1, a_regstr1);
+        mark_in<a_lib<N>>(a_cat1, a_regstr1);
+        mark_in<a_lib<N>>(a_cat2, a_regstr2);
+        mark_in<a_lib<N>>(a_cat2, a_regstr2);
+        mark_in<b_lib<N>>(b_cat1, b_regstr1);
+        mark_in<b_lib<N>>(b_cat1, b_regstr1);
+        mark_in<b_lib<N>>(b_cat2, b_regstr2);
+        mark_in<b_lib<N>>(b_cat2, b_regstr2);
+
+        if (!t.CallsMatch({
+            CALL(CORE2, DomainCreateA,         "LibA"),
+            CALL(CORE2, DomainRegisterStringA, hA, "Reg1"),
+            CALL(CORE2, DomainRegisterStringA, hA, "Reg2"),
+            CALL(CORE2, DomainCreateA,         "LibB"),
+            CALL(CORE2, DomainRegisterStringA, hB, "Reg1"),
+            CALL(CORE2, DomainRegisterStringA, hB, "Reg2"),
+            CALL(CORE2, DomainNameCategoryA,   hA, 1, "Cat1"),
+            CALL(CORE2, DomainNameCategoryA,   hA, 2, "Cat2"),
+            CALL(CORE2, DomainNameCategoryA,   hB, 1, "Cat1"),
+            CALL(CORE2, DomainNameCategoryA,   hB, 2, "Cat2"),
+            CALL(CORE2, DomainMarkEx,          hA, event_attributes{hReg1, category(1)}.get()),
+            CALL(CORE2, DomainMarkEx,          hA, event_attributes{hReg1, category(1)}.get()),
+            CALL(CORE2, DomainMarkEx,          hA, event_attributes{hReg2, category(2)}.get()),
+            CALL(CORE2, DomainMarkEx,          hA, event_attributes{hReg2, category(2)}.get()),
+            CALL(CORE2, DomainMarkEx,          hB, event_attributes{hReg1, category(1)}.get()),
+            CALL(CORE2, DomainMarkEx,          hB, event_attributes{hReg1, category(1)}.get()),
+            CALL(CORE2, DomainMarkEx,          hB, event_attributes{hReg2, category(2)}.get()),
+            CALL(CORE2, DomainMarkEx,          hB, event_attributes{hReg2, category(2)}.get()),
+        }, verbose)) return 1;
+    }
+
+    {
+        CallbackTester t;
+        constexpr int N = 6;
+        auto hA = (nvtxDomainHandle_t)1;
+        auto hB = (nvtxDomainHandle_t)2;
+
+        {
+            scoped_range_in<a_lib<N>> r1("Sequential range 1");
+            mark_in<a_lib<N>>("Mark in range");
+        }
+        {
+            scoped_range_in<a_lib<N>> r2("Sequential range 2");
+            mark_in<a_lib<N>>("Mark in range");
+        }
+        {
+            scoped_range_in<a_lib<N>> r1("Nested range 1");
+            scoped_range_in<a_lib<N>> r2("Nested range 2");
+            mark_in<a_lib<N>>("Mark in range");
+        }
+
+        {
+            scoped_range_in<b_lib<N>> r1("Sequential range 1");
+            mark_in<b_lib<N>>("Mark in range");
+        }
+        {
+            scoped_range_in<b_lib<N>> r2("Sequential range 2");
+            mark_in<b_lib<N>>("Mark in range");
+        }
+        {
+            scoped_range_in<b_lib<N>> r1("Nested range 1");
+            scoped_range_in<b_lib<N>> r2("Nested range 2");
+            mark_in<b_lib<N>>("Mark in range");
+        }
+
+        if (!t.CallsMatch({
+            CALL(CORE2, DomainCreateA,     "LibA"),
+            CALL(CORE2, DomainRangePushEx, hA, event_attributes{"Sequential range 1"}.get()),
+            CALL(CORE2, DomainMarkEx,      hA, event_attributes{"Mark in range"}.get()),
+            CALL(CORE2, DomainRangePop,    hA),
+            CALL(CORE2, DomainRangePushEx, hA, event_attributes{"Sequential range 2"}.get()),
+            CALL(CORE2, DomainMarkEx,      hA, event_attributes{"Mark in range"}.get()),
+            CALL(CORE2, DomainRangePop,    hA),
+            CALL(CORE2, DomainRangePushEx, hA, event_attributes{"Nested range 1"}.get()),
+            CALL(CORE2, DomainRangePushEx, hA, event_attributes{"Nested range 2"}.get()),
+            CALL(CORE2, DomainMarkEx,      hA, event_attributes{"Mark in range"}.get()),
+            CALL(CORE2, DomainRangePop,    hA),
+            CALL(CORE2, DomainRangePop,    hA),
+            CALL(CORE2, DomainCreateA,     "LibB"),
+            CALL(CORE2, DomainRangePushEx, hB, event_attributes{"Sequential range 1"}.get()),
+            CALL(CORE2, DomainMarkEx,      hB, event_attributes{"Mark in range"}.get()),
+            CALL(CORE2, DomainRangePop,    hB),
+            CALL(CORE2, DomainRangePushEx, hB, event_attributes{"Sequential range 2"}.get()),
+            CALL(CORE2, DomainMarkEx,      hB, event_attributes{"Mark in range"}.get()),
+            CALL(CORE2, DomainRangePop,    hB),
+            CALL(CORE2, DomainRangePushEx, hB, event_attributes{"Nested range 1"}.get()),
+            CALL(CORE2, DomainRangePushEx, hB, event_attributes{"Nested range 2"}.get()),
+            CALL(CORE2, DomainMarkEx,      hB, event_attributes{"Mark in range"}.get()),
+            CALL(CORE2, DomainRangePop,    hB),
+            CALL(CORE2, DomainRangePop,    hB),
+        }, verbose)) return 1;
+    }
+
+    if (verbose) std::cout << "--------- Success!\n";
+    return 0;
+}
diff --git a/tests/CoverageC.c b/tests/CoverageC.c
new file mode 100644
index 0000000..1aff136
--- /dev/null
+++ b/tests/CoverageC.c
@@ -0,0 +1,93 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include <nvtx3/nvToolsExt.h>
+
+static void TestCore(void)
+{
+    nvtxEventAttributes_t attributes;
+    nvtxRangeId_t rangeId;
+    /* Set each attribute field by hand. NOTE(review): members not assigned here stay indeterminate -- confirm none are read. */
+    attributes.version = NVTX_VERSION;
+    attributes.size = sizeof(attributes);
+    attributes.category = 0;
+    attributes.colorType = NVTX_COLOR_ARGB;
+    attributes.color = 0xFF1133FF;
+    attributes.payloadType = NVTX_PAYLOAD_UNKNOWN;
+    attributes.payload.llValue = 0;
+    attributes.messageType = NVTX_MESSAGE_TYPE_ASCII;
+    attributes.message.ascii = "Test message";
+    /* Invoke every domain-less mark, start/end and push/pop entry point once (Ex, A and W variants); results are not checked. */
+    nvtxMarkEx(&attributes);
+    nvtxMarkA("MarkA");
+    nvtxMarkW(L"MarkW");
+    rangeId = nvtxRangeStartEx(&attributes);
+    nvtxRangeEnd(rangeId);
+    rangeId = nvtxRangeStartA("RangeStartA");
+    nvtxRangeEnd(rangeId);
+    rangeId = nvtxRangeStartW(L"RangeStartW");
+    nvtxRangeEnd(rangeId);
+    nvtxRangePushEx(&attributes);
+    nvtxRangePop();
+    nvtxRangePushA("RangePushA");
+    nvtxRangePop();
+    nvtxRangePushW(L"RangePushW");
+    nvtxRangePop();
+}
+
+static void TestCore2(void)
+{
+    nvtxEventAttributes_t attributes;
+    nvtxRangeId_t rangeId;
+    nvtxDomainHandle_t domain, domainW;
+    /* Set each attribute field by hand. NOTE(review): members not assigned here stay indeterminate -- confirm none are read. */
+    attributes.version = NVTX_VERSION;
+    attributes.size = sizeof(attributes);
+    attributes.category = 0;
+    attributes.colorType = NVTX_COLOR_ARGB;
+    attributes.color = 0xFF1133FF;
+    attributes.payloadType = NVTX_PAYLOAD_UNKNOWN;
+    attributes.payload.llValue = 0;
+    attributes.messageType = NVTX_MESSAGE_TYPE_ASCII;
+    attributes.message.ascii = "Test message";
+    /* Create one domain through the narrow-string call and one through the wide-string call. */
+    domain = nvtxDomainCreateA("DomainA");
+    domainW = nvtxDomainCreateW(L"DomainW");
+    /* Only `domain` is exercised below; domainW is created but never used, and neither handle is destroyed here. */
+    nvtxDomainMarkEx(domain, &attributes);
+    rangeId = nvtxDomainRangeStartEx(domain, &attributes);
+    nvtxDomainRangeEnd(domain, rangeId);
+    nvtxDomainRangePushEx(domain, &attributes);
+    nvtxDomainRangePop(domain);
+}
+
+NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    /* Exported entry point for this coverage test; argc/argv are deliberately ignored. */
+    (void)argc;
+    (void)argv;
+    /* Both callees return nothing, so this always reports success (0). */
+    TestCore();
+    TestCore2();
+
+    return 0;
+}
diff --git a/tests/CoverageCpp.cpp b/tests/CoverageCpp.cpp
new file mode 100644
index 0000000..e9cb45a
--- /dev/null
+++ b/tests/CoverageCpp.cpp
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include "TestCoverage.h"
+
+extern "C" NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    // Exported entry point only: forwards both arguments to RunTestCommon (presumably declared in TestCoverage.h) and returns its result.
+    return RunTestCommon(argc, argv);
+}
diff --git a/tests/CoverageCuda.cu b/tests/CoverageCuda.cu
new file mode 100644
index 0000000..e9cb45a
--- /dev/null
+++ b/tests/CoverageCuda.cu
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include "TestCoverage.h"
+
+extern "C" NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    // Exported entry point only: forwards both arguments to RunTestCommon (presumably declared in TestCoverage.h) and returns its result.
+    return RunTestCommon(argc, argv);
+}
diff --git a/tests/CoverageExtCounter.c b/tests/CoverageExtCounter.c
new file mode 100644
index 0000000..0f3d7ec
--- /dev/null
+++ b/tests/CoverageExtCounter.c
@@ -0,0 +1,50 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include <nvtx3/nvToolsExtCounters.h>
+
+static void TestMem(void)
+{
+    nvtxDomainHandle_t domain;
+    uint64_t counter;
+    nvtxCounterAttr_t attr;
+    int64_t i64 = 0;
+    double f64 = 0.0;
+    /* NOTE(review): attr is handed to nvtxCounterRegister without being filled in, and the name TestMem looks copied from the memory test -- confirm both. */
+    domain = nvtxDomainCreateA("Domain");
+    /* Register a counter, then submit one sample through each sampling call (int64, float64, no-value); results are not checked. */
+    counter = nvtxCounterRegister(domain, &attr);
+    nvtxCounterSampleInt64(domain, counter, i64);
+    nvtxCounterSampleFloat64(domain, counter, f64);
+    nvtxCounterSampleNoValue(domain, counter, NVTX_COUNTER_SAMPLE_UNCHANGED);
+}
+
+NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    /* Exported entry point for this coverage test; argc/argv are deliberately ignored. */
+    (void)argc;
+    (void)argv;
+    /* The callee returns nothing, so this always reports success (0). */
+    TestMem();
+
+    return 0;
+}
diff --git a/tests/CoverageExtMem.c b/tests/CoverageExtMem.c
new file mode 100644
index 0000000..b8b69f4
--- /dev/null
+++ b/tests/CoverageExtMem.c
@@ -0,0 +1,46 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include <nvtx3/nvToolsExtMem.h>
+
+static void TestMem(void)
+{
+    nvtxDomainHandle_t domain;
+    nvtxMemHeapHandle_t heap;
+    nvtxMemHeapDesc_t heapDesc;
+    /* NOTE(review): heapDesc is passed uninitialized and the returned heap handle is never read -- apparently call coverage only; confirm. */
+    domain = nvtxDomainCreateA("Domain");
+    /* Call one heap-registration function and one permissions function from nvToolsExtMem.h; nothing is verified. */
+    heap = nvtxMemHeapRegister(domain, &heapDesc);
+    nvtxMemPermissionsUnbind(domain, 0);
+}
+
+NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    /* Exported entry point for this coverage test; argc/argv are deliberately ignored. */
+    (void)argc;
+    (void)argv;
+    /* The callee returns nothing, so this always reports success (0). */
+    TestMem();
+
+    return 0;
+}
diff --git a/tests/CoverageExtMemCudaRt.cu b/tests/CoverageExtMemCudaRt.cu
new file mode 100644
index 0000000..6efb5ab
--- /dev/null
+++ b/tests/CoverageExtMemCudaRt.cu
@@ -0,0 +1,45 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include <nvtx3/nvToolsExtMemCudaRt.h>
+
+static void TestMemCudaRt(void)
+{
+    nvtxDomainHandle_t domain;
+    nvtxMemPermissionsHandle_t perm;
+    /* Calls two functions from nvToolsExtMemCudaRt.h once each; nothing is verified. */
+    domain = nvtxDomainCreateA("Domain");
+    /* The process-wide permissions handle is fed straight into the peer-access call, with 0 for both remaining arguments. */
+    perm = nvtxMemCudaGetProcessWidePermissions(domain);
+    nvtxMemCudaSetPeerAccess(domain, perm, 0, 0);
+}
+
+extern "C" NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    /* Exported entry point for this coverage test; argc/argv are deliberately ignored. */
+    (void)argc;
+    (void)argv;
+    /* The callee returns nothing, so this always reports success (0). */
+    TestMemCudaRt();
+
+    return 0;
+}
diff --git a/tests/CoverageExtPayload.c b/tests/CoverageExtPayload.c
new file mode 100644
index 0000000..051b73b
--- /dev/null
+++ b/tests/CoverageExtPayload.c
@@ -0,0 +1,46 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include <nvtx3/nvToolsExtPayload.h>
+
+static void TestMem(void)
+{
+    nvtxDomainHandle_t domain;
+    uint8_t enabled;
+    uint64_t handle;
+    nvtxPayloadSchemaAttr_t attr;
+    /* NOTE(review): attr is passed uninitialized; enabled and handle are assigned but never read -- apparently call coverage only; confirm. */
+    domain = nvtxDomainCreateA("Domain");
+    enabled = nvtxDomainIsEnabled(domain);
+    handle = nvtxPayloadSchemaRegister(domain, &attr);
+}
+
+NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    /* Exported entry point for this coverage test; argc/argv are deliberately ignored. */
+    (void)argc;
+    (void)argv;
+    /* The callee returns nothing, so this always reports success (0). */
+    TestMem();
+
+    return 0;
+}
diff --git a/tests/DllHelper.h b/tests/DllHelper.h
new file mode 100644
index 0000000..707cbb9
--- /dev/null
+++ b/tests/DllHelper.h
@@ -0,0 +1,73 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#pragma once
+
+/* To export a function from a DLL, include nvtx3/nvToolsExt.h, then:
+ * - Put extern "C" (if C++) and NVTX_DYNAMIC_EXPORT in front of the function declaration/definition
+ * - Put NVTX_EXPORT_UNMANGLED_FUNCTION_NAME inside the function body to prevent name-mangling
+ *
+ * On GCC and similar compilers, it's best to build with -fvisibility=hidden.  This ensures normal
+ * functions will not be dynamic exports.  In CMake, that can be done with:
+ *   set_target_properties(MyTarget PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden)
+ *
+ * If you can't build with that flag, then push visibility=hidden and never pop it:
+ *   #ifdef __GNUC__
+ *   #pragma GCC visibility push(hidden)
+ *   #endif
+ *
+ * Note that NVTX_DYNAMIC_EXPORT will export a function even if the default visibility is hidden.
+ * NVTX_EXPORT_UNMANGLED_FUNCTION_NAME isn't necessary on many platforms, but using it will ensure
+ * success when loading function pointers via GET_DLL_FUNC (see below) on any platform, and from
+ * other languages' C bindings.
+ */
+
+#if defined(_WIN32)
+
+#include <windows.h>
+
+/* Don't try to use wide chars here -- stick with char* for simpler cross-plat coding */
+#define DLL_HANDLE     HMODULE
+#define DLL_OPEN(x)    LoadLibraryA(x)
+#define DLL_CLOSE(x)   FreeLibrary(x)  /* Win32 has no FreeLibraryA/W; FreeLibrary takes the HMODULE */
+#define GET_DLL_FUNC   GetProcAddress
+#if defined(_MSC_VER)
+#define DLL_PREFIX     ""
+#else
+#define DLL_PREFIX     "lib"
+#endif
+#define DLL_SUFFIX     ".dll"
+
+#else /* Assume GCC-like compiler, but don't require defined(__GNUC__) */
+
+#include <dlfcn.h>
+
+#define DLL_HANDLE     void*
+#define DLL_OPEN(lib)  dlopen(lib, RTLD_LAZY)
+#define DLL_CLOSE(h)   dlclose(h)
+#define GET_DLL_FUNC   dlsym
+#define DLL_PREFIX     "lib"
+#if defined(__APPLE__)
+#define DLL_SUFFIX     ".dylib"
+#else
+#define DLL_SUFFIX     ".so"
+#endif
+
+#endif
diff --git a/tests/Domains.cpp b/tests/Domains.cpp
new file mode 100644
index 0000000..4500092
--- /dev/null
+++ b/tests/Domains.cpp
@@ -0,0 +1,117 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#if defined(_MSC_VER) && _MSC_VER < 1914
+#define STATIC_ASSERT_TESTING 0
+#else
+#define STATIC_ASSERT_TESTING 1
+#endif
+// Test the macro's value, not defined(): it is always defined (0 or 1) above, so defined() would enable the override everywhere.
+#if STATIC_ASSERT_TESTING
+#include <stdio.h>
+#define NVTX3_STATIC_ASSERT(c, m) do { if (!(c)) printf("static_assert would fail: %s\n", m); } while (0)
+#endif
+
+#include <nvtx3/nvtx3.hpp>
+
+#include <iostream>
+
+// Domain description types, used below as template tags for domain::get<> / mark_in<>; the two error_* types are deliberately malformed.
+struct char_test              { static constexpr const char*    name{"Test char"}; };
+struct wchar_test             { static constexpr const wchar_t* name{L"Test wchar_t"}; };
+struct error_name_missing     { static constexpr const char*    not_name{"Test name is missing"}; };
+struct error_name_is_bad_type { static constexpr const int      name{5}; };
+
+extern "C" NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    // Exported test entry point: emits marks in the global domain and in several tagged domains; ignores its arguments and always returns 0.
+    (void)argc;
+    (void)argv;
+
+    using namespace nvtx3;
+    // Disabled debugging aid: prints what detail::is_c_string reports for a range of pointer and non-pointer types.
+    if (0)
+    {
+        std::cout << std::boolalpha;
+        std::cout << "is_c_string<const char *>     = " << detail::is_c_string<const char*>::value << '\n';
+        std::cout << "is_c_string<const wchar_t *>  = " << detail::is_c_string<const wchar_t*>::value << '\n';
+        std::cout << "is_c_string<const char *&>    = " << detail::is_c_string<const char*&>::value << '\n';
+        std::cout << "is_c_string<const wchar_t *&> = " << detail::is_c_string<const wchar_t*&>::value << '\n';
+        std::cout << "is_c_string<const char *c&>   = " << detail::is_c_string<const char* const&>::value << '\n';
+        std::cout << "is_c_string<const wchar_t *c&>= " << detail::is_c_string<const wchar_t* const&>::value << '\n';
+        std::cout << "is_c_string<char *>     = " << detail::is_c_string<char*>::value << '\n';
+        std::cout << "is_c_string<wchar_t *>  = " << detail::is_c_string<wchar_t*>::value << '\n';
+        std::cout << "is_c_string<char *&>    = " << detail::is_c_string<char*&>::value << '\n';
+        std::cout << "is_c_string<wchar_t *&> = " << detail::is_c_string<wchar_t*&>::value << '\n';
+        std::cout << "is_c_string<char *c&>   = " << detail::is_c_string<char* const&>::value << '\n';
+        std::cout << "is_c_string<wchar_t *c&>= " << detail::is_c_string<wchar_t* const&>::value << '\n';
+
+        std::cout << "is_c_string<int>       = " << detail::is_c_string<int>::value << '\n';
+        std::cout << "is_c_string<const int*>= " << detail::is_c_string<const int*>::value << '\n';
+        std::cout << "is_c_string<void*>     = " << detail::is_c_string<void*>::value << '\n';
+
+        std::cout << "-------------\n";
+    }
+    // Each stage prints a label, then emits a mark in that domain; from the second stage on the domain object is also fetched but not used further.
+    std::cout << "- Global domain (mark alias):\n";
+    mark("Mark in global domain (implicit)");
+
+    std::cout << "- Global domain implicit:\n";
+    auto& gi = domain::get<>();
+    mark_in<>("Mark in global domain (implicit)");
+
+    std::cout << "- Global domain explicit:\n";
+    auto& ge = domain::get<domain::global>();
+    mark_in<domain::global>("Mark in global domain (explicit)");
+
+    std::cout << "- Test domain (char):\n";
+    auto& d1 = domain::get<char_test>();
+    mark_in<char_test>("Mark in char_test domain");
+
+    std::cout << "- Test domain (wchar_t):\n";
+    auto& d2 = domain::get<wchar_test>();
+    mark_in<wchar_test>("Mark in wchar_test domain");
+
+#if STATIC_ASSERT_TESTING
+    // Error-path stages: the NVTX3_STATIC_ASSERT override at the top of this file is expected to print a message rather than stop the build -- confirm in nvtx3.hpp.
+#if 1 // defined(ERROR_TEST_NAME_IS_MISSING)
+    {
+        std::cout << "- Error test - domain is missing name member:\n";
+        auto& d3 = domain::get<error_name_missing>();
+        mark_in<error_name_missing>("Mark in error_name_missing domain");
+        scoped_range_in<error_name_missing> r3("Mark in error_name_missing domain");
+    }
+#endif
+
+#if 1 // defined(ERROR_TEST_NAME_IS_BAD_TYPE)
+    {
+        std::cout << "- Error test - domain name member isn't narrow or wide char array:\n";
+        auto& d4 = domain::get<error_name_is_bad_type>();
+        mark_in<error_name_is_bad_type>("Mark in error_name_is_bad_type domain");
+        scoped_range_in<error_name_is_bad_type> r4("Mark in error_name_is_bad_type domain");
+    }
+#endif
+
+#endif // STATIC_ASSERT_TESTING
+
+    return 0;
+}
diff --git a/tests/ExportApi.c b/tests/ExportApi.c
new file mode 100644
index 0000000..114b2ae
--- /dev/null
+++ b/tests/ExportApi.c
@@ -0,0 +1,26 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#define NVTX_EXPORT_API
+#include <nvtx3/nvToolsExt.h>
+#include <nvtx3/nvToolsExtCuda.h>
+#include <nvtx3/nvToolsExtCudaRt.h>
+#include <nvtx3/nvToolsExtOpenCL.h>
+#include <nvtx3/nvToolsExtSync.h>
diff --git a/tests/Imports/cuda_lite/cuda.h b/tests/Imports/cuda_lite/cuda.h
new file mode 100644
index 0000000..7c2ad51
--- /dev/null
+++ b/tests/Imports/cuda_lite/cuda.h
@@ -0,0 +1,9 @@
+#ifndef CUDA_H_
+#define CUDA_H_
+/* One integer id plus pointers to struct tags never defined here. Presumably a stand-in for the real CUDA header in tests -- confirm. */
+typedef int CUdevice;
+typedef struct CUctx_st *CUcontext;
+typedef struct CUstream_st *CUstream;
+typedef struct CUevent_st *CUevent;
+
+#endif /* CUDA_H_ */
diff --git a/tests/Imports/cuda_lite/driver_types.h b/tests/Imports/cuda_lite/driver_types.h
new file mode 100644
index 0000000..b74c268
--- /dev/null
+++ b/tests/Imports/cuda_lite/driver_types.h
@@ -0,0 +1,7 @@
+#ifndef DRIVER_TYPES_H_
+#define DRIVER_TYPES_H_
+/* cudaStream_t / cudaEvent_t declared as pointers to struct tags that are not defined in this header. */
+typedef struct CUstream_st *cudaStream_t;
+typedef struct CUevent_st *cudaEvent_t;
+
+#endif /* DRIVER_TYPES_H_ */
diff --git a/tests/Imports/opencl_lite/CL/cl.h b/tests/Imports/opencl_lite/CL/cl.h
new file mode 100644
index 0000000..1c333fa
--- /dev/null
+++ b/tests/Imports/opencl_lite/CL/cl.h
@@ -0,0 +1,14 @@
+#ifndef CL_H_
+#define CL_H_
+
+typedef struct _cl_platform_id *    cl_platform_id;
+typedef struct _cl_device_id *      cl_device_id;
+typedef struct _cl_context *        cl_context;
+typedef struct _cl_command_queue *  cl_command_queue;
+typedef struct _cl_mem *            cl_mem;
+typedef struct _cl_program *        cl_program;
+typedef struct _cl_kernel *         cl_kernel;
+typedef struct _cl_event *          cl_event;
+typedef struct _cl_sampler *        cl_sampler;
+
+#endif /* CL_H_ */
diff --git a/tests/InjectionHelper.h b/tests/InjectionHelper.h
new file mode 100644
index 0000000..3b17dbd
--- /dev/null
+++ b/tests/InjectionHelper.h
@@ -0,0 +1,580 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#pragma once
+
+// [Best practices for injection implementations]
+// Set NVTX_NO_IMPL to make the NVTX headers define the API types and function
+// prototypes only, not the inline impls.  Be sure on GCC to use -Wno-unused-function
+// to avoid warnings for undefined static prototypes.
+#define NVTX_NO_IMPL
+
+// [Best practices for injection implementations]
+// Microsoft's compiler issues warning 26812 when compiling a C-style enum in C++
+// instead of using the new "enum class" style.  Since the NVTX headers are written in
+// C, the enums defined there will trigger this warning.  Use this code to disable it.
+#if defined(_MSC_VER)
+#pragma warning (disable : 26812)
+#endif
+#include <nvtx3/nvToolsExt.h>
+
+#include <type_traits>
+#include <utility>
+#include <tuple>
+#include <limits>
+
+namespace NvtxInjectionHelper {
+
+//============ Generic utility functions ======================================
+
+inline namespace detail_generic {
+
+//--- maxVal ---
+
+// Variadic alternative to std::max that doesn't need an initializer list,
+// doesn't conflict with MSVC's #define for max, and has no trouble with
+// constexpr usage.  Handles having zero parameters passed, returning
+// std::numeric_limits<T>::lowest() in that case, as long as the template
+// parameter T is explicitly specified.  Takes arguments by value, which
+// avoids the issue of returning a reference to something when called
+// with no parameters.  Example uses:
+//     maxVal<int>(3, 9, 4) == 9;   maxVal<unsigned int>() == 0
+
+template <typename T>
+constexpr inline T maxVal() { return std::numeric_limits<T>::lowest(); }
+
+template <typename T, typename... Rest>
+constexpr inline T maxVal(T first, Rest... rest)
+{
+    T restMax = maxVal<T>(rest...);
+    return (first > restMax) ? first : restMax;
+}
+
+//--- tuple size helper ---
+
+// Generic utility for getting the size of a std::tuple, using its value
+// as opposed to std::tuple_size<> which takes the tuple's type.  In a
+// generic lambda where the parameter's type is "auto", it's extra work
+// to figure out the type
+template <typename... Ts>
+constexpr inline size_t size_of_tuple(std::tuple<Ts...> const&)
+{
+    return sizeof...(Ts);
+}
+
+//--- tuple helpers to loop over items ---
+
+// We need a way to call a function f on each element of a tuple, like this:
+//
+//     f(std::get<0>(t));
+//     f(std::get<1>(t));
+//     f(std::get<2>(t)); etc.
+//
+// We want something like this, where "Is" is a parameter pack of 0,1,2,etc.:
+//
+//     f(std::get<Is>(t))...;
+//
+// ...but parameter pack expansion is only allowed within the context of args
+// to a function call or a braced init list.  We also must handle the case
+// where the tuple is empty, we should discard the results of all the calls
+// to f, even if it returns different types for each call.  Easiest way to
+// do this is by forwarding the elements of the tuple as args to a helper
+// function that calls f on each arg, like this:
+//
+//    for_each_in_parameter_pack(f, std::get<Is>(t)...);
+//
+// But we also want perfect forwarding of the function and the tuple.
+// The following utilities "for_each_in_tuple", "for_each_in_tuple_helper",
+// and "for_each_in_parameter_pack" are provided to allow code such as this
+// "loop" over tuple elements.  Note that "thing" in each iteration can be
+// a different type, because a tuple's elements may be different types, so
+// generic lambdas are very convenient here:
+//
+//    for_each_in_tuple(tuple_of_things,
+//        [](auto const& thing)
+//        {
+//            std::cout << thing << std::endl;
+//        }
+//    );
+
+template<typename F>
+inline void for_each_in_parameter_pack(F&& f) {}
+
+template<typename F, typename First, typename... Rest>
+inline void for_each_in_parameter_pack(F&& f, First const& first, Rest const&... rest)
+{
+    // Call f on the first argument, and explicitly discard the result by casting to void
+    static_cast<void>(std::forward<F>(f)(first));
+
+    // Recurse to call f on the rest of the arguments
+    for_each_in_parameter_pack(std::forward<F>(f), rest...);
+}
+
+// Generic utility for calling a function f for each element of a tuple t
+template<typename T, typename F, size_t... Is>
+inline void for_each_in_tuple_helper(T const& t, F&& f, std::index_sequence<Is...>)
+{
+    for_each_in_parameter_pack(
+        std::forward<F>(f),
+        std::get<Is>(t)...
+    );
+}
+template<typename... Ts, typename F>
+inline void for_each_in_tuple(std::tuple<Ts...> const& t, F&& f)
+{
+    for_each_in_tuple_helper(t, std::forward<F>(f), std::make_index_sequence<sizeof...(Ts)>());
+}
+
+} // namespace detail_generic
+
+//============ NVTX injection helper internal utilities =======================
+
+inline namespace detail_nvtx {
+
+//--- id_t ---
+// Define generic integer type for holding all modules' callback id enum values.
+// These are used as indexes into the handler arrays for each module.
+using id_t = unsigned int;
+
+//--- id_v ---
+// Nickname for std::integral_constant, which is used for all callback enum values.
+// Using an integral constant allows performing correctness checks at compile time,
+// which is not possible in C++ with function parameter values, only their types.
+// Including the value in the type works around this problem.
+template <typename EnumT, EnumT EnumVal>
+using id_v = std::integral_constant<EnumT, EnumVal>;
+
+//--- NVTX_CBID ---
+// Macro to succinctly turn an NVTX_CBID_* enum value into a compile-time constant,
+// using std::integral_constant.  This makes it possible to perform correctness
+// checks at compile time, for example ensuring a handler's signature is compatible
+// with the NVTX API call it is being installed to handle.  Syntax is meant to look
+// familiar.  For example, replace:
+//     NVTX_CBID_CORE_MarkA
+// with:
+//     NVTX_CBID(CORE_MarkA)
+// when passing CBID values to NvtxInjectionHelper::MakeHandlerTable.
+#define NVTX_CBID(func) NvtxInjectionHelper::id_v<decltype(NVTX_CBID_##func), NVTX_CBID_##func>{}
+
+//--- EnumTypeToModuleId ---
+// Template variable to map from call id enum types to module id values (see nvtxTypes.h)
+// For example, EnumTypeToModuleId<NvtxCallbackIdCore> == NVTX_CB_MODULE_CORE.
+template <typename EnumT>
+constexpr static NvtxCallbackModule EnumTypeToModuleId = NVTX_CB_MODULE_INVALID;
+
+template<> constexpr NvtxCallbackModule EnumTypeToModuleId<NvtxCallbackIdCore  > = NVTX_CB_MODULE_CORE;
+template<> constexpr NvtxCallbackModule EnumTypeToModuleId<NvtxCallbackIdCuda  > = NVTX_CB_MODULE_CUDA;
+template<> constexpr NvtxCallbackModule EnumTypeToModuleId<NvtxCallbackIdOpenCL> = NVTX_CB_MODULE_OPENCL;
+template<> constexpr NvtxCallbackModule EnumTypeToModuleId<NvtxCallbackIdCudaRt> = NVTX_CB_MODULE_CUDART;
+template<> constexpr NvtxCallbackModule EnumTypeToModuleId<NvtxCallbackIdCore2 > = NVTX_CB_MODULE_CORE2;
+template<> constexpr NvtxCallbackModule EnumTypeToModuleId<NvtxCallbackIdSync  > = NVTX_CB_MODULE_SYNC;
+
+//--- IdToModuleId ---
+// Helper for EnumTypeToModuleId to convert directly from an integral_constant of a call id enum
+// to its module id.  For example, since NVTX_CBID(CORE_MarkA) is an integral_constant, its type
+// cannot be used directly as in EnumTypeToModuleId<decltype(NVTX_CBID(CORE_MarkA))>, because that
+// type is std::integral_constant<NvtxCallbackIdCore, NVTX_CBID_CORE_MarkA>.  This helper extracts
+// the enum's type from it, allowing IdToModuleId<decltype(NVTX_CBID(CORE_MarkA))>.
+template <typename IdT>
+constexpr static NvtxCallbackModule IdToModuleId = EnumTypeToModuleId<typename IdT::value_type>;
+
+
+//--- IdToHandlerType
+// Template struct used to map from call id types to matching function pointer types.
+template <typename IdT> struct IdToHandlerType { using type = nullptr_t; };
+
+// Macro for defining IdToHandlerType specializations for each id.
+// mod = module, i.e. CORE, CORE2
+// func = prefixless function name, i.e. MarkEx, DomainCreateA
+// impl = impl or fakeimpl, depending on whether or not to use real types or the
+//        nvtxTypes.h "fakeimpl" types, which don't depend on CUDA/OpenCL headers.
+#define NVTX_ID_TO_TYPE(mod, func, impl) \
+template <> struct IdToHandlerType<decltype(NVTX_CBID(mod##_##func))> { using type = nvtx##func##_##impl##_fntype; }
+
+NVTX_ID_TO_TYPE(CORE, MarkEx       , impl);
+NVTX_ID_TO_TYPE(CORE, MarkA        , impl);
+NVTX_ID_TO_TYPE(CORE, MarkW        , impl);
+NVTX_ID_TO_TYPE(CORE, RangeStartEx , impl);
+NVTX_ID_TO_TYPE(CORE, RangeStartA  , impl);
+NVTX_ID_TO_TYPE(CORE, RangeStartW  , impl);
+NVTX_ID_TO_TYPE(CORE, RangeEnd     , impl);
+NVTX_ID_TO_TYPE(CORE, RangePushEx  , impl);
+NVTX_ID_TO_TYPE(CORE, RangePushA   , impl);
+NVTX_ID_TO_TYPE(CORE, RangePushW   , impl);
+NVTX_ID_TO_TYPE(CORE, RangePop     , impl);
+NVTX_ID_TO_TYPE(CORE, NameCategoryA, impl);
+NVTX_ID_TO_TYPE(CORE, NameCategoryW, impl);
+NVTX_ID_TO_TYPE(CORE, NameOsThreadA, impl);
+NVTX_ID_TO_TYPE(CORE, NameOsThreadW, impl);
+
+NVTX_ID_TO_TYPE(CORE2, DomainMarkEx         , impl);
+NVTX_ID_TO_TYPE(CORE2, DomainRangeStartEx   , impl);
+NVTX_ID_TO_TYPE(CORE2, DomainRangeEnd       , impl);
+NVTX_ID_TO_TYPE(CORE2, DomainRangePushEx    , impl);
+NVTX_ID_TO_TYPE(CORE2, DomainRangePop       , impl);
+NVTX_ID_TO_TYPE(CORE2, DomainResourceCreate , impl);
+NVTX_ID_TO_TYPE(CORE2, DomainResourceDestroy, impl);
+NVTX_ID_TO_TYPE(CORE2, DomainNameCategoryA  , impl);
+NVTX_ID_TO_TYPE(CORE2, DomainNameCategoryW  , impl);
+NVTX_ID_TO_TYPE(CORE2, DomainRegisterStringA, impl);
+NVTX_ID_TO_TYPE(CORE2, DomainRegisterStringW, impl);
+NVTX_ID_TO_TYPE(CORE2, DomainCreateA        , impl);
+NVTX_ID_TO_TYPE(CORE2, DomainCreateW        , impl);
+NVTX_ID_TO_TYPE(CORE2, DomainDestroy        , impl);
+NVTX_ID_TO_TYPE(CORE2, Initialize           , impl);
+
+#undef NVTX_ID_TO_TYPE
+
+//--- CheckHandlerTypeMatchesId ---
+// Compile-time check provides easy-to-read error if FuncT isn't compatible with IdT
+template <typename IdT, typename FuncT>
+constexpr inline void CheckHandlerTypeMatchesId()
+{
+    using ExpectedFuncT = typename IdToHandlerType<IdT>::type;
+
+    static_assert(std::is_same<ExpectedFuncT, FuncT>(),
+        "NVTX Injection Helper: The provided handler function's signature does not match the NVTX API for the given call id.");
+}
+
+//--- Handler ---
+// Represents id/handler pair for an NVTX call.  Provides:
+//    - the call's id (NVTX_CBID_* enum values)
+//    - handler function pointer
+// Preserves the type of the function as a template parameter.
+// Erases the type of the enum, so it's not module-specific anymore.
+// Allows being constructed and placed into a container at compile time, then
+// later at run time doing the run-time-only cast of the function pointer.
+// This enables processing of ids to occur at compile time.
+template <typename FuncT>
+class Handler
+{
+public:
+    id_t id;
+    FuncT pfn;
+
+    template <typename EnumT, EnumT EnumVal>
+    constexpr Handler(id_v<EnumT, EnumVal> e, FuncT pfn_)
+        : id(static_cast<id_t>(EnumVal)) // Erase enum's type
+        , pfn(pfn_)
+    {}
+
+    NvtxFunctionPointer Address() const noexcept
+    {
+        return reinterpret_cast<NvtxFunctionPointer>(pfn);
+    }
+};
+
+//--- MakeHandler ---
+// "Make" function for Handler to automatically deduce types from parameters
+template <typename IdT, typename FuncT>
+constexpr inline Handler<FuncT> MakeHandler(IdT id_, FuncT func)
+{
+    CheckHandlerTypeMatchesId<IdT, FuncT>();
+    return Handler<FuncT>(id_, func);
+}
+
+//--- ModuleHandlerTable ---
+// Represents the set of Handlers for one module.  Provides:
+//    - the module's id (NVTX_CB_MODULE_* enum values)
+//    - iterable container of id/handler pairs (empty means skip getting etbl for module)
+//    - highest call id value of handler in module (to confirm client has sufficient size)
+//    - a method to assign all the stored handlers into a client's handler table
+// These objects can be constructed at compile time, including the highest call id used.
+template <NvtxCallbackModule mod, typename... Funcs>
+class ModuleHandlerTable
+{
+public:
+    using tuple_t = std::tuple<Handler<Funcs>...>;
+
+    static constexpr NvtxCallbackModule moduleId = mod;
+    tuple_t handlers;
+    id_t highestIdUsed;
+
+    constexpr ModuleHandlerTable(tuple_t t)
+        : handlers(t)
+        , highestIdUsed(FindHighestId(t))
+    {}
+
+    void AssignToClient(NvtxFunctionTable clientTable) const noexcept
+    {
+        for_each_in_tuple(handlers,
+            [clientTable](auto const& handler)
+            {
+                if (handler.id != 0 && handler.pfn != nullptr)
+                {
+                    *clientTable[handler.id] = handler.Address();
+                }
+            }
+        );
+    }
+
+private:
+    template <size_t... Is>
+    static constexpr id_t FindHighestIdHelper(tuple_t t, std::index_sequence<Is...>)
+    {
+        return maxVal<id_t>(std::get<Is>(t).id...);
+    }
+
+    static constexpr id_t FindHighestId(tuple_t t)
+    {
+        return FindHighestIdHelper(t, std::make_index_sequence<sizeof...(Funcs)>());
+    }
+};
+
+
+//--- MakeModuleHandlerTuple ---
+// MakeModuleHandlerTuple takes NvtxCallbackModule "mod" as a template parameter,
+// and loops over pairs of arguments (an enum and a handler function), building a
+// tuple of Handler objects for the enums that are in module "mod", and ignoring
+// ones that aren't.  This lets the user pass in handlers for all modules in
+// one simple call, and we can build up separate handler tables for each module.
+// MakeModuleHandlerTuple is recursive, peeling off two arguments in each recursive
+// case, and having no args be the base case.  The recursive case has a pair of
+// overloads for whether or not the enum's type matches "mod" or not.  Since these
+// overloads are separate functions, it's mutual recursion, so both are declared
+// first before the definitions.
+
+// Base case: no more arguments
+template <NvtxCallbackModule mod>
+constexpr inline auto MakeModuleHandlerTuple()
+{
+    return std::tuple<>{};
+}
+
+// Prototypes of recursive cases -- needed since they can call each other
+template <NvtxCallbackModule mod, typename IdT, typename FuncT,
+    std::enable_if_t<IdToModuleId<IdT> == mod, int> = 0,
+    typename... Args>
+constexpr inline auto MakeModuleHandlerTuple(IdT, FuncT, Args...);
+
+template <NvtxCallbackModule mod, typename IdT, typename FuncT,
+    std::enable_if_t<IdToModuleId<IdT> != mod, int> = 0,
+    typename... Args>
+constexpr inline auto MakeModuleHandlerTuple(IdT, FuncT, Args...);
+
+// Recursive case 1: enum's type matches mod, so add it to the tuple
+template <NvtxCallbackModule mod, typename IdT, typename FuncT,
+    std::enable_if_t<IdToModuleId<IdT> == mod, int>,
+    typename... Args>
+constexpr inline auto MakeModuleHandlerTuple(IdT id, FuncT f, Args... rest)
+{
+    // Verify types of id and function, using static_assert to provide a
+    // clear compile error if the types don't meet the requirements.
+    static_assert(IdToModuleId<IdT> != NVTX_CB_MODULE_INVALID,
+        "MakeHandlerTable arguments must be pairs of IDs and handler functions.  IDs must be enums starting with NVTX_CBID_.  An invalid ID value was provided.");
+
+    // Build the tuple for the remaining arguments first, then prepend this
+    // id/handler pair.  NOTE(review): nothing here rejects an id that already
+    // appears in restTuple; a compile-time duplicate check is still missing.
+    auto restTuple = MakeModuleHandlerTuple<mod>(rest...);
+
+    return std::tuple_cat(
+        std::make_tuple(MakeHandler(id, f)),
+        restTuple);
+}
+
+// Recursive case 2: id is not in module, so fwd result from remaining args
+template <NvtxCallbackModule mod, typename IdT, typename FuncT,
+    std::enable_if_t<IdToModuleId<IdT> != mod, int>,
+    typename... Args>
+constexpr inline auto MakeModuleHandlerTuple(IdT id, FuncT f, Args... rest)
+{
+    return MakeModuleHandlerTuple<mod>(rest...);
+}
+
+//--- MakeModuleHandlerFromTuple ---
+// Helper function for MakeModuleHandlerTable.  Converts a tuple of Handlers into
+// a ModuleHandlerTable object.  This approach was simpler than building up the
+// ModuleHandlerTable incrementally, since std::tuple_cat makes it so easy to
+// build up a tuple.
+template <NvtxCallbackModule mod, typename... Funcs>
+constexpr inline auto MakeModuleHandlerFromTuple(std::tuple<Handler<Funcs>...> t)
+{
+    return ModuleHandlerTable<mod, Funcs...>(t);
+}
+
+//--- "Make" function for ModuleHandlerTable to automatically deduce type ---
+// First, create a tuple of just the handlers in the argument list in module "mod".
+// Uses the mutually-recursive MakeModuleHandlerTuple overloads, which only add
+// handlers into the tuple if the module matches.  Then, MakeModuleHandlerFromTuple
+// converts the tuple into a properly-typed ModuleHandlerTable object.
+template <NvtxCallbackModule mod, typename... Args>
+constexpr inline auto MakeModuleHandlerTable(Args... args)
+{
+    const auto handlerTuple = MakeModuleHandlerTuple<mod>(args...);
+    return MakeModuleHandlerFromTuple<mod>(handlerTuple);
+}
+
+} // namespace detail_nvtx
+
+//============ NVTX injection helper public interface =========================
+
+// Define sentinel-value constants for use in handler implementations
+namespace ReturnCodes {
+    constexpr auto NVTX_TOOL_ATTACHED_UNUSED_RANGE_ID = static_cast<nvtxRangeId_t>(-1LL);
+    constexpr int  NVTX_TOOL_ATTACHED_UNUSED_PUSH_POP_ID = -1;
+    const     auto NVTX_TOOL_ATTACHED_UNUSED_DOMAIN_HANDLE = reinterpret_cast<nvtxDomainHandle_t>(-1LL);
+    const     auto NVTX_TOOL_ATTACHED_UNUSED_STRING_HANDLE = reinterpret_cast<nvtxStringHandle_t>(-1LL);
+    // Note: In C++20, use bit_cast instead of reinterpret_cast, so the handles
+    // (which are pointer types) can also be made constexpr.
+}
+
+template <typename... Args>
+constexpr inline auto MakeHandlerTable(Args... args)
+{
+    return std::make_tuple(
+        MakeModuleHandlerTable<NVTX_CB_MODULE_CORE  >(args...),
+        MakeModuleHandlerTable<NVTX_CB_MODULE_CUDA  >(args...),
+        MakeModuleHandlerTable<NVTX_CB_MODULE_OPENCL>(args...),
+        MakeModuleHandlerTable<NVTX_CB_MODULE_CUDART>(args...),
+        MakeModuleHandlerTable<NVTX_CB_MODULE_CORE2 >(args...),
+        MakeModuleHandlerTable<NVTX_CB_MODULE_SYNC  >(args...)
+    );
+}
+
+enum class InstallResult
+{
+    Success,
+    ExportTableVersionInfoMissing,
+    ExportTableVersionInfoTooSmall,
+    ClientVersionTooOld,
+    ExportTableCallbacksMissing,
+    ExportTableCallbacksTooSmall,
+    ModuleNotSupported,
+    ModuleTableTooSmall
+};
+
+// Installs the handlers in injectionHandlerTable (as built by MakeHandlerTable) into
+// the client's NVTX callback tables obtained through getExportTable.  errStream, if
+// non-null, receives a description of each failure; pVersion, if non-null, receives
+// the client's NVTX version once it is known to be at least 2.  Modules with no
+// handlers are skipped, and nothing is assigned into a module table that is missing
+// or too small.  Returns Success, or the InstallResult for the first failure found.
+// NOTE(review): uses std::ostringstream and printf, but this header includes neither <sstream> nor <cstdio>.
+template <typename HandlerTableT>
+inline InstallResult InstallHandlers(
+    NvtxGetExportTableFunc_t getExportTable,
+    HandlerTableT const& injectionHandlerTable,
+    std::ostringstream* errStream = nullptr,
+    uint32_t* pVersion = nullptr)
+{
+    auto pVersionInfo =
+        reinterpret_cast<const NvtxExportTableVersionInfo*>(getExportTable(NVTX_ETID_VERSIONINFO));
+    if (!pVersionInfo)
+    {
+        if (errStream) *errStream << "Client NVTX instance doesn't support NVTX_ETID_VERSIONINFO";
+        return InstallResult::ExportTableVersionInfoMissing;
+    }
+
+    if (pVersionInfo->struct_size < sizeof(*pVersionInfo))
+    {
+        if (errStream) *errStream
+            << "NvtxExportTableVersionInfo structure size is " << pVersionInfo->struct_size
+            << ", expected " << sizeof(*pVersionInfo) << "!";
+        return InstallResult::ExportTableVersionInfoTooSmall;
+    }
+
+    const uint32_t version = pVersionInfo->version;
+    if (version < 2)
+    {
+        if (errStream) *errStream << "client's NVTX version is " << version << ", expected 2+";
+        return InstallResult::ClientVersionTooOld;
+    }
+
+    if (pVersion) *pVersion = version;
+
+    auto pCallbacks =
+        reinterpret_cast<const NvtxExportTableCallbacks*>(getExportTable(NVTX_ETID_CALLBACKS));
+    if (!pCallbacks)
+    {
+        if (errStream) *errStream << "Client NVTX instance doesn't support NVTX_ETID_CALLBACKS";
+        return InstallResult::ExportTableCallbacksMissing;
+    }
+
+    if (pCallbacks->struct_size < sizeof(*pCallbacks))
+    {
+        if (errStream) *errStream
+            << "NvtxExportTableCallbacks structure size is " << pCallbacks->struct_size
+            << ", expected " << sizeof(*pCallbacks) << "!";
+        return InstallResult::ExportTableCallbacksTooSmall;
+    }
+
+#if defined(DEBUG) || true
+    // Simple loop to print handler table internal details
+    for_each_in_tuple(injectionHandlerTable,
+        [](auto const& handlerModule)
+        {
+            auto count = size_of_tuple(handlerModule.handlers);
+            printf("Module: %d   Count: %d  Highest: %d\n",
+                (int)handlerModule.moduleId, (int)count, (int)handlerModule.highestIdUsed);
+
+            if (count > 0)
+            {
+                for_each_in_tuple(handlerModule.handlers,
+                    [](auto const& handler)
+                    {
+                        auto addr = (long long)handler.Address();
+                        printf("    Id: %d  Address: 0x%llx\n",
+                            (int)handler.id, addr);
+                    }
+                );
+            }
+        }
+    );
+#endif
+
+    // Install each module's handlers into the client, remembering the first failure
+    InstallResult result = InstallResult::Success;
+    for_each_in_tuple(injectionHandlerTable,
+        [&](auto const& handlerModule)
+        {
+            if (handlerModule.moduleId == NVTX_CB_MODULE_INVALID) return; // Not a real module
+            if (size_of_tuple(handlerModule.handlers) == 0) return;       // Nothing to install
+
+            NvtxFunctionTable clientTable = 0;
+            unsigned int clientTableSize = 0;
+            int success = pCallbacks->GetModuleFunctionTable(handlerModule.moduleId, &clientTable, &clientTableSize);
+            if (!success || !clientTable)
+            {
+                if (errStream) *errStream
+                    << "Client NVTX instance doesn't support callback module with id " << handlerModule.moduleId;
+                if (result == InstallResult::Success) result = InstallResult::ModuleNotSupported;
+                return; // No client table to assign into
+            }
+
+            // Ensure client's table is new enough to support the function pointers we want to register
+            if (clientTableSize <= handlerModule.highestIdUsed)
+            {
+                if (errStream) *errStream
+                    << "Size of client NVTX instance's handler table with module id " << handlerModule.moduleId
+                    << " too small.  Size is " << clientTableSize
+                    << ", but injection needs to assign table[" << handlerModule.highestIdUsed << "]";
+                if (result == InstallResult::Success) result = InstallResult::ModuleTableTooSmall;
+                return; // Assigning would index past the end of the client's table
+            }
+
+            handlerModule.AssignToClient(clientTable);
+        }
+    );
+
+    return result;
+}
+
+} // namespace NvtxInjectionHelper
diff --git a/tests/LinkerDupesFileA.cpp b/tests/LinkerDupesFileA.cpp
new file mode 100644
index 0000000..819ead8
--- /dev/null
+++ b/tests/LinkerDupesFileA.cpp
@@ -0,0 +1,26 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include "TestCoverage.h"
+
+void FileA(int argc, const char** argv)
+{
+    RunTestCommon(argc, argv);
+}
diff --git a/tests/LinkerDupesFileB.cpp b/tests/LinkerDupesFileB.cpp
new file mode 100644
index 0000000..10427d3
--- /dev/null
+++ b/tests/LinkerDupesFileB.cpp
@@ -0,0 +1,26 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include "TestCoverage.h"
+
+void FileB(int argc, const char** argv)
+{
+    RunTestCommon(argc, argv);
+}
diff --git a/tests/LinkerDupesMain.cpp b/tests/LinkerDupesMain.cpp
new file mode 100644
index 0000000..69913cf
--- /dev/null
+++ b/tests/LinkerDupesMain.cpp
@@ -0,0 +1,35 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include <nvtx3/nvToolsExt.h> // Just for export macros
+
+void FileA(int argc, const char** argv);
+void FileB(int argc, const char** argv);
+
+extern "C" NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+
+    FileA(argc, argv);
+    FileB(argc, argv);
+
+    return 0;
+}
diff --git a/tests/NamedCategories.cpp b/tests/NamedCategories.cpp
new file mode 100644
index 0000000..039110a
--- /dev/null
+++ b/tests/NamedCategories.cpp
@@ -0,0 +1,148 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#if defined(_MSC_VER) && _MSC_VER < 1914
+#define STATIC_ASSERT_TESTING 0
+#else
+#define STATIC_ASSERT_TESTING 1
+#endif
+
+#if defined(STATIC_ASSERT_TESTING)
+#include <stdio.h>
+#define NVTX3_STATIC_ASSERT(c, m) do { if (!(c)) printf("static_assert would fail: %s\n", m); } while (0)
+#endif
+
+#include <nvtx3/nvtx3.hpp>
+
+#include <iostream>
+
+// Domain description type: supplies the name of the domain used by RunTest
+struct d { static constexpr const char*    name{"Test domain"}; };
+// Named category types.  Each cat_* type supplies both a `name` and an `id` member; each error_* type
+// omits or mistypes at least one of them, as its name describes (used by the error tests in RunTest).
+struct cat_char_test          { static constexpr const char*    name{"Cat char"};     static constexpr uint32_t id{1};  };
+struct cat_wchar_test         { static constexpr const wchar_t* name{L"Cat wchar_t"}; static constexpr uint32_t id{2};  };
+struct error_name_missing     { static constexpr const char*    x   {"Name"};         static constexpr uint32_t id{3};  };
+struct error_name_is_bad_type { static constexpr const int      name{5};              static constexpr uint32_t id{4};  };
+struct error_id_missing       { static constexpr const char*    name{"Name"};         static constexpr uint32_t y {5};  };
+struct error_id_is_bad_type   { static constexpr const char*    name{"Name"};         static constexpr float    id{6};  };
+struct error_both_missing     { static constexpr const char*    x   {"Name"};         static constexpr uint32_t y {7};  };
+struct error_both_bad_type    { static constexpr const int      name{5};              static constexpr float    id{8};  };
+struct error_no_name_bad_id   { static constexpr const char*    x   {"Name"};         static constexpr float    id{9};  };
+struct error_bad_name_no_id   { static constexpr const int      name{5};              static constexpr uint32_t y {10}; };
+struct cat_global_domain1     { static constexpr const char*    name{"Global1"};      static constexpr uint32_t id{11}; };
+struct cat_global_domain2     { static constexpr const char*    name{"Global2"};      static constexpr uint32_t id{12}; };
+struct cat_global_domain3     { static constexpr const char*    name{"Global3"};      static constexpr uint32_t id{13}; };
+
+extern "C" NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    // Test entry point: looks up named categories by type, printing a label before each case; always returns 0.
+    (void)argc;
+    (void)argv;
+
+    using namespace nvtx3;
+
+    auto& d1 = domain::get<d>();
+    // The references obtained in this function (d1, c1, cd1, ...) are never read again; only the lookups matter here.
+#if 1
+    std::cout << "- Named category (char):\n";
+    auto& c1 = named_category_in<d>::get<cat_char_test>();
+    mark_in<d>("Mark in cat_char_test category", named_category_in<d>::get<cat_char_test>());
+
+    std::cout << "- Named category (wchar_t):\n";
+    auto& c2 = named_category_in<d>::get<cat_wchar_test>();
+    mark_in<d>("Mark in cat_wchar_test category", named_category_in<d>::get<cat_wchar_test>());
+#endif
+
+#if 1
+    std::cout << "- Named category in global domain (alias):\n";
+    auto& cd1 = named_category::get<cat_global_domain1>();
+
+    std::cout << "- Named category in global domain (implicit):\n";
+    auto& cd2 = named_category_in<>::get<cat_global_domain2>();
+
+    std::cout << "- Named category in global domain (explicit):\n";
+    auto& cd3 = named_category_in<domain::global>::get<cat_global_domain3>();
+#endif
+
+#if STATIC_ASSERT_TESTING
+    // Each block below calls get<> with a deliberately malformed category type.  NOTE(review): presumably the NVTX3_STATIC_ASSERT override at the top of this file turns the failed check into a run-time message -- confirm against nvtx3.hpp.
+#if 1 // defined(ERROR_TEST_NAME_IS_MISSING)
+    {
+        std::cout << "- Error test - category is missing name member:\n";
+        auto& c3 = named_category_in<d>::get<error_name_missing>();
+    }
+#endif
+
+#if 1 // defined(ERROR_TEST_NAME_IS_BAD_TYPE)
+    {
+        std::cout << "- Error test - category name member isn't narrow or wide char array:\n";
+        auto& c4 = named_category_in<d>::get<error_name_is_bad_type>();
+    }
+#endif
+
+#if 1 // defined(ERROR_TEST_ID_IS_MISSING)
+    {
+        std::cout << "- Error test - category is missing id member:\n";
+        auto& c5 = named_category_in<d>::get<error_id_missing>();
+    }
+#endif
+
+#if 1 // defined(ERROR_TEST_ID_IS_BAD_TYPE)
+    {
+        std::cout << "- Error test - category id member isn't uint32_t:\n";
+        auto& c6 = named_category_in<d>::get<error_id_is_bad_type>();
+    }
+#endif
+
+#if 1 // defined(ERROR_TEST_BOTH_MISSING)
+    {
+        std::cout << "- Error test - category is missing both members:\n";
+        auto& c7 = named_category_in<d>::get<error_both_missing>();
+    }
+#endif
+
+#if 1 // defined(ERROR_TEST_BOTH_BAD_TYPE)
+    {
+        std::cout << "- Error test - category members are both bad types:\n";
+        auto& c8 = named_category_in<d>::get<error_both_bad_type>();
+    }
+#endif
+
+#if 1 // defined(ERROR_TEST_NO_NAME_BAD_ID)
+    {
+        std::cout << "- Error test - category has no name and bad id type:\n";
+        auto& c9 = named_category_in<d>::get<error_no_name_bad_id>();
+    }
+#endif
+
+#if 1 // defined(ERROR_TEST_BAD_NAME_NO_ID)
+    {
+        std::cout << "- Error test - category has bad name type and no id:\n";
+        auto& c10 = named_category_in<d>::get<error_bad_name_no_id>();
+    }
+#endif
+
+#endif // STATIC_ASSERT_TESTING
+
+    return 0;
+}
diff --git a/tests/PathHelper.h b/tests/PathHelper.h
new file mode 100644
index 0000000..23e08a1
--- /dev/null
+++ b/tests/PathHelper.h
@@ -0,0 +1,214 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#pragma once
+
+/* Dynamic libraries should be loaded with absolute paths to avoid
+ * problems not finding things in the search paths.  Construct the
+ * absolute path to a dynamic library in the same directory as the
+ * process's executable, or some subdirectory of it, using these
+ * utility functions.  C++17's std::filesystem makes this much
+ * easier, but these utilities should work in C++11.
+ */
+
+#if defined(_WIN32)
+
+#include <windows.h>
+
+#else
+
+#if defined(__CYGWIN__)
+#if defined(__POSIX_VISIBLE)
+#if __POSIX_VISIBLE < 200112L
+#error On Cygwin, you must `#define _POSIX_C_SOURCE 200112L` or greater before including any headers so that readlink() is available. You can achieve this by including this header before any others.
+#endif
+#endif
+#if defined(_POSIX_C_SOURCE)
+#undef _POSIX_C_SOURCE
+#endif
+#define _POSIX_C_SOURCE 200809L
+#endif
+
+#include <unistd.h>
+
+#endif
+
+#if defined(__APPLE__)
+#include <libproc.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <vector>
+
+#include "DllHelper.h"
+
+#if defined(_WIN32)
+constexpr char pathSep = '\\'; // Directory separator used when building paths on Windows
+#else
+constexpr char pathSep = '/'; // Directory separator used on every other platform
+#endif
+
+// Returns the absolute path of the current process's executable, or an
+// empty string if it cannot be determined (allocation or OS query
+// failure).  Adapted from C functions in the NVTXW implementation.
+static std::string GetCurrentProcessPath()
+{
+    char* buf = NULL;
+#if defined(_WIN32)
+    {
+        DWORD size = MAX_PATH;
+        while (1)
+        {
+            // Grow through a temporary so the old buffer isn't leaked on failure
+            char* grown = (char*)realloc(buf, size);
+            if (!grown)
+            {
+                free(buf);
+                return std::string();
+            }
+            buf = grown;
+            // Returns 0 on failure, or size when the path had to be truncated
+            DWORD newSize = GetModuleFileNameA(NULL, buf, size);
+            if (newSize == 0)
+            {
+                free(buf);
+                return std::string();
+            }
+            if (newSize < size) break;
+            size *= 2;
+        }
+    }
+#elif defined(__APPLE__)
+    {
+        buf = (char*)malloc(PROC_PIDPATHINFO_MAXSIZE);
+        if (!buf) return std::string();
+        // A non-positive result means the lookup failed
+        if (proc_pidpath(getpid(), buf, PROC_PIDPATHINFO_MAXSIZE) <= 0)
+        {
+            free(buf);
+            return std::string();
+        }
+    }
+#elif defined(__QNX__)
+    {
+        // fpathconf returns a signed long (-1 on error), so test it before
+        // converting to size_t; fall back to 4096 when no limit is reported
+        long limit = fpathconf(0, _PC_MAX_INPUT);
+        size_t size = (limit > 0) ? (size_t)limit : 4096;
+        ++size;
+        buf = (char*)malloc(size);
+        if (!buf) return std::string();
+        if (!_cmdname(buf))
+        {
+            free(buf);
+            return std::string();
+        }
+    }
+#else
+    {
+        size_t size = 1024;
+        size_t bytesRead;
+        static const char linkName[] = "/proc/self/exe";
+        while (1)
+        {
+            char* grown = (char*)realloc(buf, size);
+            if (!grown)
+            {
+                free(buf);
+                return std::string();
+            }
+            buf = grown;
+            ssize_t bytesReadSigned = readlink(linkName, buf, size);
+            if (bytesReadSigned < 0)
+            {
+                free(buf);
+                return std::string();
+            }
+            bytesRead = (size_t)bytesReadSigned;
+            // A result that fills the buffer may be truncated; retry larger
+            if (bytesRead < size) break;
+            size *= 2;
+        }
+        // readlink does not null-terminate what it writes
+        buf[bytesRead] = '\0';
+    }
+#endif
+
+    // Every path that reaches here holds a valid, null-terminated buffer
+    std::string result(buf);
+    free(buf);
+    return result;
+}
+
+// Returns the directory that contains the current process's executable,
+// including the trailing path separator, so other file or directory
+// names can be appended directly.  Returns an empty string if the
+// executable path is unavailable or has no separator.  Inline so that
+// several source files can include this header.  Examples:
+//    "C:\path\to\foo.exe" -> "C:\path\to\"     "C:\foo.exe" -> "C:\"
+//    "/path/to/foo" -> "/path/to/"             "/foo" -> "/"
+inline std::string GetCurrentProcessDirWithSep()
+{
+    std::string exeAbsPath = GetCurrentProcessPath();
+    const std::string::size_type lastSep = exeAbsPath.find_last_of(pathSep);
+    exeAbsPath.resize((lastSep == std::string::npos) ? 0 : lastSep + 1);
+    return exeAbsPath;
+}
+
+// Take the absolute path to the current process's executable,
+// remove the executable's name, and then append the library
+// filename.  Applies the standard dynamic library prefix and
+// suffix to the library's base name, but the suffix may be
+// overridden if it isn't the standard one (e.g. ".so.1.1").
+// If subDirs has any entries, they are added between the
+// directory and the library name, each followed by a path
+// separator.  Inline so multiple sources can include this.  Examples:
+//   (Assuming process is C:\path\to\foo.exe on Windows)
+//     AbsolutePathToLibraryInCurrentProcessPath("example")
+//       -> C:\path\to\example.dll
+//     AbsolutePathToLibraryInCurrentProcessPath("example", {"nested", "deeper"})
+//       -> C:\path\to\nested\deeper\example.dll
+//   (Assuming process is /path/to/foo on Linux)
+//     AbsolutePathToLibraryInCurrentProcessPath("example")
+//       -> /path/to/libexample.so
+//     AbsolutePathToLibraryInCurrentProcessPath("example", {"nested", "deeper"}, ".so.1")
+//       -> /path/to/nested/deeper/libexample.so.1
+inline std::string AbsolutePathToLibraryInCurrentProcessPath(
+    std::string libraryBaseName,
+    std::vector<std::string> subDirs = {},
+    std::string libSuffix = DLL_SUFFIX)
+{
+    std::string result = GetCurrentProcessDirWithSep();
+    // Nested directories come first, each terminated by a separator
+    for (auto const& subDir : subDirs)
+    {
+        result += subDir;
+        result += pathSep;
+    }
+    // Then the file name: prefix + base name + suffix
+    result += DLL_PREFIX;
+    result += libraryBaseName;
+    result += libSuffix;
+
+    return result;
+}
diff --git a/tests/PrettyPrintersNvtxC.h b/tests/PrettyPrintersNvtxC.h
new file mode 100644
index 0000000..bdba96a
--- /dev/null
+++ b/tests/PrettyPrintersNvtxC.h
@@ -0,0 +1,160 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#pragma once
+#include <nvtx3/nvToolsExt.h>
+#include <iostream>
+
+// Pretty-printers for color, payload, and message discriminated-union types
+
+inline void WriteColorType(std::ostream& os, nvtxColorType_t t)
+{
+    // Streams the enumerator name for a color type.  NVTX_COLOR_UNKNOWN
+    // gets placeholder text, and any unrecognized value is shown together
+    // with its numeric value.
+    if      (t == NVTX_COLOR_ARGB)    { os << "NVTX_COLOR_ARGB"; }
+    else if (t == NVTX_COLOR_UNKNOWN) { os << "<UNKNOWN TYPE>"; }
+    else                              { os << "<INVALID TYPE = " << static_cast<int32_t>(t) << ">"; }
+}
+
+inline std::ostream& operator<<(std::ostream& os, nvtxColorType_t t) // Stream-insertion form of WriteColorType
+{
+    WriteColorType(os, t);
+    return os; // Return the stream so insertions can be chained
+}
+
+inline void WritePayloadType(std::ostream& os, nvtxPayloadType_t t)
+{
+    // Streams the enumerator name for a payload type.  The typed names are
+    // padded with spaces to one width; NVTX_PAYLOAD_UNKNOWN and unrecognized
+    // values (shown with their numeric value) use placeholder text instead.
+    if      (t == NVTX_PAYLOAD_TYPE_UNSIGNED_INT64) { os << "NVTX_PAYLOAD_TYPE_UNSIGNED_INT64"; }
+    else if (t == NVTX_PAYLOAD_TYPE_INT64)          { os << "NVTX_PAYLOAD_TYPE_INT64         "; }
+    else if (t == NVTX_PAYLOAD_TYPE_DOUBLE)         { os << "NVTX_PAYLOAD_TYPE_DOUBLE        "; }
+    else if (t == NVTX_PAYLOAD_TYPE_UNSIGNED_INT32) { os << "NVTX_PAYLOAD_TYPE_UNSIGNED_INT32"; }
+    else if (t == NVTX_PAYLOAD_TYPE_INT32)          { os << "NVTX_PAYLOAD_TYPE_INT32         "; }
+    else if (t == NVTX_PAYLOAD_TYPE_FLOAT)          { os << "NVTX_PAYLOAD_TYPE_FLOAT         "; }
+    else if (t == NVTX_PAYLOAD_UNKNOWN)             { os << "<UNKNOWN TYPE>"; }
+    else                                            { os << "<INVALID TYPE = " << static_cast<int32_t>(t) << ">"; }
+}
+
+inline void WritePayloadValue(std::ostream& os, nvtxPayloadType_t t, nvtxEventAttributes_v2::payload_t val)
+{
+    // Streams the union member of val that the payload type t selects.
+    // NVTX_PAYLOAD_UNKNOWN and unrecognized types print placeholder text
+    // without reading the union.
+    if      (t == NVTX_PAYLOAD_TYPE_UNSIGNED_INT64) { os << val.ullValue; }
+    else if (t == NVTX_PAYLOAD_TYPE_INT64)          { os << val.llValue; }
+    else if (t == NVTX_PAYLOAD_TYPE_DOUBLE)         { os << val.dValue; }
+    else if (t == NVTX_PAYLOAD_TYPE_UNSIGNED_INT32) { os << val.uiValue; }
+    else if (t == NVTX_PAYLOAD_TYPE_INT32)          { os << val.iValue; }
+    else if (t == NVTX_PAYLOAD_TYPE_FLOAT)          { os << val.fValue; }
+    else if (t == NVTX_PAYLOAD_UNKNOWN)             { os << "<IGNORED VALUE>"; }
+    else                                            { os << "<INVALID VALUE>"; }
+}
+
+inline void WritePayload(std::ostream& os, nvtxPayloadType_t t, nvtxEventAttributes_v2::payload_t val) // Writes "<type> = <value>"
+{
+    WritePayloadType(os, t);
+    os << " = ";
+    WritePayloadValue(os, t, val); // The same type t selects which union member is printed
+}
+
+inline std::ostream& operator<<(std::ostream& os, nvtxPayloadType_t t) // Stream-insertion form of WritePayloadType
+{
+    WritePayloadType(os, t);
+    return os; // Return the stream so insertions can be chained
+}
+
+inline void WriteMessageType(std::ostream& os, nvtxMessageType_t t)
+{
+    // Streams the enumerator name for a message type.  NVTX_MESSAGE_UNKNOWN
+    // gets placeholder text, and any unrecognized value is shown together
+    // with its numeric value.
+    if      (t == NVTX_MESSAGE_TYPE_ASCII)      { os << "NVTX_MESSAGE_TYPE_ASCII"; }
+    else if (t == NVTX_MESSAGE_TYPE_UNICODE)    { os << "NVTX_MESSAGE_TYPE_UNICODE"; }
+    else if (t == NVTX_MESSAGE_TYPE_REGISTERED) { os << "NVTX_MESSAGE_TYPE_REGISTERED"; }
+    else if (t == NVTX_MESSAGE_UNKNOWN)         { os << "<UNKNOWN TYPE>"; }
+    else                                        { os << "<INVALID TYPE = " << static_cast<int32_t>(t) << ">"; }
+}
+
+inline void WriteMessageValue(std::ostream& os, nvtxMessageType_t t, nvtxMessageValue_t val) // Writes the member of val selected by t
+{
+    switch (t) // Wide strings are not converted; unknown and invalid types print placeholders
+    {
+        case NVTX_MESSAGE_TYPE_ASCII     : os << (val.ascii ? val.ascii : "<NULL>"); break; // Streaming a null char* is undefined behavior
+        case NVTX_MESSAGE_TYPE_UNICODE   : os << "<Some wide chars>";   break;
+        case NVTX_MESSAGE_TYPE_REGISTERED: os << "Registered handle: " << (void*)val.registered; break;
+        case NVTX_MESSAGE_UNKNOWN        : os << "<IGNORED VALUE>";     break;
+        default                          : os << "<INVALID VALUE>";
+    }
+}
+
+inline void WriteMessage(std::ostream& os, nvtxMessageType_t t, nvtxMessageValue_t val) // Writes "<type> = <value>"
+{
+    WriteMessageType(os, t);
+    os << " = ";
+    WriteMessageValue(os, t, val); // The same type t selects which union member is printed
+}
+
+inline std::ostream& operator<<(std::ostream& os, nvtxMessageType_t t) // Stream-insertion form of WriteMessageType
+{
+    WriteMessageType(os, t);
+    return os; // Return the stream so insertions can be chained
+}
+
+// Pretty-printer for attributes struct.  Two layouts are kept: the compact one-line form
+// directly below is the one compiled (#if 1); the one-field-per-line form after #else is disabled.
+#if 1
+inline std::ostream& operator<<(std::ostream& os, nvtxEventAttributes_t const& a)
+{
+    os << "{ver: " << a.version
+        << ", size: " << a.size
+        << ", category: " << a.category
+        << ", color: " << (nvtxColorType_t)a.colorType << " 0x" << std::hex << a.color << std::dec
+        << ", payload: " << (nvtxPayloadType_t)a.payloadType << " ";
+    WritePayloadValue(os, (nvtxPayloadType_t)a.payloadType, a.payload); // Value only; the type name was streamed above
+    os << ", message: " << (nvtxMessageType_t)a.messageType << " \"";
+    WriteMessageValue(os, (nvtxMessageType_t)a.messageType, a.message); // Printed between the double quotes
+    os << "\"}";
+
+    return os;
+}
+#else
+inline std::ostream& operator<<(std::ostream& os, nvtxEventAttributes_t const& a)
+{
+    os
+        << "uint16_t version = " << a.version << "\n"
+        << "uint16_t size = " << a.size << "\n"
+        << "uint32_t category = " << a.category << "\n"
+        << "int32_t colorType = " << (nvtxColorType_t)a.colorType << "\n"
+        << "uint32_t color = 0x" << std::hex << a.color << std::dec << "\n"
+        << "int32_t payloadType = " << (nvtxPayloadType_t)a.payloadType << "\n"
+        << "(union) payload = ";
+    WritePayloadValue(os, (nvtxPayloadType_t)a.payloadType, a.payload);
+    os << "\n"
+        << "int32_t messageType = " << (nvtxMessageType_t)a.messageType << "\n"
+        << "(union) message = ";
+    WriteMessageValue(os, (nvtxMessageType_t)a.messageType, a.message);
+    os << "\n";
+
+    return os;
+}
+#endif
diff --git a/tests/PrettyPrintersNvtxCpp.h b/tests/PrettyPrintersNvtxCpp.h
new file mode 100644
index 0000000..3a09948
--- /dev/null
+++ b/tests/PrettyPrintersNvtxCpp.h
@@ -0,0 +1,40 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#pragma once
+#include "PrettyPrintersNvtxC.h"
+#include <nvtx3/nvtx3.hpp>
+
+inline std::ostream& operator<<(std::ostream& os, nvtx3::event_attributes const& attr)
+{
+    return os << *attr.get(); // Streams the object attr.get() points to; presumably the C attributes struct printed by PrettyPrintersNvtxC.h -- confirm
+}
+
+inline std::ostream& operator<<(std::ostream& os, nvtx3::payload const& p) // Prints a payload through WritePayload
+{
+    WritePayload(os, p.get_type(), p.get_value()); // Type and value are both read from the wrapper's accessors
+    return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, nvtx3::message const& m) // Prints a message through WriteMessage
+{
+    WriteMessage(os, m.get_type(), m.get_value()); // Type and value are both read from the wrapper's accessors
+    return os;
+}
diff --git a/tests/PrintInjectionC.c b/tests/PrintInjectionC.c
new file mode 100644
index 0000000..fe876e9
--- /dev/null
+++ b/tests/PrintInjectionC.c
@@ -0,0 +1,269 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#define NVTX_NO_IMPL
+#ifdef SUPPORT_STATIC_INJECTION
+/* Ensure the static injection's init function pointer isn't defined as weak */
+#define NVTX_STATIC_INJECTION_IMPL
+#endif
+#include "nvtx3/nvToolsExt.h"
+
+#include <stdio.h>
+
+/* Use a compiler option to define this prefix string to a custom value */
+#ifndef INJECTION_PRINT_PREFIX
+#define INJECTION_PRINT_PREFIX "inj"
+#endif
+/* Logging macros: the first argument must be a string literal, since it is concatenated with the prefix.  Both write to stdout; defining NVTX_INJECTION_TEST_QUIET compiles them out. */
+#if defined(NVTX_INJECTION_TEST_QUIET)
+#define LOG_INFO(...)
+#define LOG_ERROR(...)
+#else
+#define LOG_INFO(...)  fprintf(stdout, "[" INJECTION_PRINT_PREFIX "] " __VA_ARGS__)
+#define LOG_ERROR(...) fprintf(stdout, "[" INJECTION_PRINT_PREFIX "] ERROR: " __VA_ARGS__)
+#endif
+
+/* Implementations of NVTX functions to attach to client */
+/* Placeholder return values for the handlers below: each is -1 cast to the relevant return type. */
+#define NVTX_TOOL_ATTACHED_UNUSED_RANGE_ID (nvtxRangeId_t)(-1LL)
+#define NVTX_TOOL_ATTACHED_UNUSED_PUSH_POP_ID (int)(-1)
+#define NVTX_TOOL_ATTACHED_UNUSED_DOMAIN_HANDLE (nvtxDomainHandle_t)(-1LL)
+#define NVTX_TOOL_ATTACHED_UNUSED_STRING_HANDLE (nvtxStringHandle_t)(-1LL)
+
+/* NVTX_CB_MODULE_CORE */
+
+static void NVTX_API HandleMarkA(const char* str) /* Registered for NVTX_CBID_CORE_MarkA */
+{
+    LOG_INFO("%s\n", "nvtxMarkA"); /* Logs the API name only; str is not used */
+}
+
+static int NVTX_API HandleRangePushA(const char* str) /* Registered for NVTX_CBID_CORE_RangePushA */
+{
+    LOG_INFO("%s\n", "nvtxRangePushA"); /* Logs the API name only; str is not used */
+    return NVTX_TOOL_ATTACHED_UNUSED_PUSH_POP_ID; /* Placeholder value, (int)(-1) */
+}
+
+static int NVTX_API HandleRangePop(void) /* (void) is a real prototype in C; empty parentheses leave the parameters unspecified */
+{
+    LOG_INFO("%s\n", "nvtxRangePop"); /* Logs the API name only */
+    return NVTX_TOOL_ATTACHED_UNUSED_PUSH_POP_ID; /* Placeholder value, (int)(-1) */
+}
+
+/* NVTX_CB_MODULE_CORE2 */
+
+static void NVTX_API HandleDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
+{
+    LOG_INFO("%s\n", "nvtxDomainMarkEx"); /* Logs the API name only; both arguments are unused */
+}
+
+static nvtxRangeId_t NVTX_API HandleDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
+{
+    LOG_INFO("%s\n", "nvtxDomainRangeStartEx"); /* Logs the API name only; both arguments are unused */
+    return NVTX_TOOL_ATTACHED_UNUSED_RANGE_ID; /* Placeholder id, -1 cast to nvtxRangeId_t */
+}
+
+static void NVTX_API HandleDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id)
+{
+    LOG_INFO("%s\n", "nvtxDomainRangeEnd"); /* Logs the API name only; both arguments are unused */
+}
+
+static int NVTX_API HandleDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib)
+{
+    LOG_INFO("%s\n", "nvtxDomainRangePushEx"); /* Logs the API name only; both arguments are unused */
+    return NVTX_TOOL_ATTACHED_UNUSED_PUSH_POP_ID; /* Placeholder value, (int)(-1) */
+}
+
+static int NVTX_API HandleDomainRangePop(nvtxDomainHandle_t domain)
+{
+    LOG_INFO("%s\n", "nvtxDomainRangePop"); /* Logs the API name only; domain is unused */
+    return NVTX_TOOL_ATTACHED_UNUSED_PUSH_POP_ID; /* Placeholder value, (int)(-1) */
+}
+
+static nvtxStringHandle_t NVTX_API HandleDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string)
+{
+    LOG_INFO("%s\n", "nvtxDomainRegisterStringA"); /* Logs the API name only; both arguments are unused */
+    return NVTX_TOOL_ATTACHED_UNUSED_STRING_HANDLE; /* Placeholder handle, -1 cast to nvtxStringHandle_t */
+}
+
+static nvtxDomainHandle_t NVTX_API HandleDomainCreateA(const char* name)
+{
+    LOG_INFO("%s\n", "nvtxDomainCreateA"); /* Logs the API name only; name is unused */
+    return NVTX_TOOL_ATTACHED_UNUSED_DOMAIN_HANDLE; /* Placeholder handle, -1 cast to nvtxDomainHandle_t */
+}
+
+static void NVTX_API HandleDomainDestroy(nvtxDomainHandle_t domain)
+{
+    LOG_INFO("%s\n", "nvtxDomainDestroy"); /* Logs the API name only; domain is unused */
+}
+
+static void NVTX_API HandleInitialize(const void* reserved)
+{
+    LOG_INFO("%s\n", "nvtxInitialize"); /* Logs the API name only; reserved is unused */
+}
+
+/* Registers this injection's handlers in the client's CORE and CORE2 callback tables.  Returns 1 on
+*  success, or 0 if a required export table or module is missing, too small, or from NVTX older than v2.
+*  Kept static so the dynamic/preinject/static entry points below can expose it based on the #defines used. */
+static int NVTX_API InitializeInjectionNvtx2Internal(NvtxGetExportTableFunc_t getExportTable)
+{
+    uint32_t version = 0;
+    const NvtxExportTableVersionInfo* pVersionInfo;
+    const NvtxExportTableCallbacks* pCallbacks;
+    NvtxFunctionTable table = 0;
+    unsigned int size = 0;
+    int success;
+    unsigned int highestIdUsed;
+    /* Version info is optional: when the client doesn't provide it, version stays 0 and setup continues */
+    pVersionInfo = (const NvtxExportTableVersionInfo*)getExportTable(NVTX_ETID_VERSIONINFO);
+    if (pVersionInfo)
+    {
+        if (pVersionInfo->struct_size < sizeof(*pVersionInfo))
+        {
+            LOG_ERROR(
+                "(init v2) NvtxExportTableVersionInfo structure size is %d, expected %d!\n",
+                (int)pVersionInfo->struct_size,
+                (int)sizeof(*pVersionInfo));
+            return 0;
+        }
+
+        version = pVersionInfo->version;
+        if (version < 2)
+        {
+            LOG_ERROR(
+                "(init v2) client's NVTX version is %d, expected 2+\n",
+                (int)version);
+            return 0;
+        }
+    }
+    /* Cast to int so the argument matches %d, as the other log calls in this function do */
+    LOG_INFO("---- InitializeInjectionNvtx2 called from client's NVTX v%d\n", (int)version);
+
+    pCallbacks = (const NvtxExportTableCallbacks*)getExportTable(NVTX_ETID_CALLBACKS);
+    if (!pCallbacks)
+    {
+        LOG_ERROR("(init v2) NVTX_ETID_CALLBACKS is not supported.\n");
+        return 0;
+    }
+
+    if (pCallbacks->struct_size < sizeof(*pCallbacks))
+    {
+        LOG_ERROR("(init v2) NvtxExportTableCallbacks structure size is %d, expected %d!\n",
+            (int)pCallbacks->struct_size,
+            (int)sizeof(*pCallbacks));
+        return 0;
+    }
+    /* NVTX_CB_MODULE_CORE: MarkA, RangePushA, RangePop */
+    {
+        table = 0;
+        size = 0;
+        success = pCallbacks->GetModuleFunctionTable(NVTX_CB_MODULE_CORE, &table, &size);
+        if (!success || !table)
+        {
+            LOG_ERROR("(init v2) NVTX_CB_MODULE_CORE is not supported.\n");
+            return 0;
+        }
+
+        /* Ensure client's table is new enough to support the function pointers we want to register */
+        highestIdUsed = NVTX_CBID_CORE_RangePop; /* Can auto-detect this in C++ */
+        if (size <= highestIdUsed)
+        {
+            LOG_ERROR("(init v2) Client's function pointer table size is %d, and we need to assign to table[%d].\n",
+                (int)size,
+                (int)highestIdUsed);
+            return 0;
+        }
+
+        *table[NVTX_CBID_CORE_MarkA     ] = (NvtxFunctionPointer)HandleMarkA     ;
+        *table[NVTX_CBID_CORE_RangePushA] = (NvtxFunctionPointer)HandleRangePushA;
+        *table[NVTX_CBID_CORE_RangePop  ] = (NvtxFunctionPointer)HandleRangePop  ;
+    }
+    /* NVTX_CB_MODULE_CORE2: domain functions and Initialize */
+    {
+        table = 0;
+        size = 0;
+        success = pCallbacks->GetModuleFunctionTable(NVTX_CB_MODULE_CORE2, &table, &size);
+        if (!success || !table)
+        {
+            LOG_ERROR("(init v2) NVTX_CB_MODULE_CORE2 is not supported.\n");
+            return 0;
+        }
+
+        /* Ensure client's table is new enough to support the function pointers we want to register */
+        highestIdUsed = NVTX_CBID_CORE2_Initialize; /* Can auto-detect this in C++ */
+        if (size <= highestIdUsed)
+        {
+            LOG_ERROR("(init v2) Client's function pointer table size is %d, and we need to assign to table[%d].\n",
+                (int)size,
+                (int)highestIdUsed);
+            return 0;
+        }
+
+        *table[NVTX_CBID_CORE2_DomainMarkEx         ] = (NvtxFunctionPointer)HandleDomainMarkEx         ;
+        *table[NVTX_CBID_CORE2_DomainRangeStartEx   ] = (NvtxFunctionPointer)HandleDomainRangeStartEx   ;
+        *table[NVTX_CBID_CORE2_DomainRangeEnd       ] = (NvtxFunctionPointer)HandleDomainRangeEnd       ;
+        *table[NVTX_CBID_CORE2_DomainRangePushEx    ] = (NvtxFunctionPointer)HandleDomainRangePushEx    ;
+        *table[NVTX_CBID_CORE2_DomainRangePop       ] = (NvtxFunctionPointer)HandleDomainRangePop       ;
+        *table[NVTX_CBID_CORE2_DomainRegisterStringA] = (NvtxFunctionPointer)HandleDomainRegisterStringA;
+        *table[NVTX_CBID_CORE2_DomainCreateA        ] = (NvtxFunctionPointer)HandleDomainCreateA        ;
+        *table[NVTX_CBID_CORE2_DomainDestroy        ] = (NvtxFunctionPointer)HandleDomainDestroy        ;
+        *table[NVTX_CBID_CORE2_Initialize           ] = (NvtxFunctionPointer)HandleInitialize           ;
+    }
+
+    return 1;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* For dynamic (env-var based) and preinject (LD_PRELOAD based) support, provide
+*  dynamic exports with the appropriate names.  These should be implemented as
+*  tail-calls since the ABI exactly matches the internal implementation function,
+*  but performance is really not a concern since these functions are called once
+*  per client initialization. */
+
+#ifdef SUPPORT_DYNAMIC_INJECTION
+NVTX_DYNAMIC_EXPORT /* Exported entry point, compiled only when SUPPORT_DYNAMIC_INJECTION is defined */
+int NVTX_API InitializeInjectionNvtx2(NvtxGetExportTableFunc_t getExportTable)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    return InitializeInjectionNvtx2Internal(getExportTable); /* Same argument in, result passed through unchanged */
+}
+#endif
+
+#ifdef SUPPORT_PREINJECTION
+/* Note: this mode is not supported by the NVTX loader on Windows */
+NVTX_DYNAMIC_EXPORT /* Exported entry point, compiled only when SUPPORT_PREINJECTION is defined */
+int NVTX_API InitializeInjectionNvtx2Preinject(NvtxGetExportTableFunc_t getExportTable)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    return InitializeInjectionNvtx2Internal(getExportTable); /* Same argument in, result passed through unchanged */
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef SUPPORT_STATIC_INJECTION
+/* Redefine the symbol without using attribute weak (NVTX_STATIC_INJECTION_IMPL is defined at the top of this file for that purpose), pointing it at this injection's initializer. */
+/* Note: this mode is not supported by the NVTX loader on Windows */
+NvtxInitializeInjectionNvtxFunc_t InitializeInjectionNvtx2_fnptr = InitializeInjectionNvtx2Internal;
+#endif
diff --git a/tests/PrintInjectionCpp.cpp b/tests/PrintInjectionCpp.cpp
new file mode 100644
index 0000000..f8da03c
--- /dev/null
+++ b/tests/PrintInjectionCpp.cpp
@@ -0,0 +1,21 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include "PrintInjectionC.c"
diff --git a/tests/RegisteredStrings.cpp b/tests/RegisteredStrings.cpp
new file mode 100644
index 0000000..118eeaf
--- /dev/null
+++ b/tests/RegisteredStrings.cpp
@@ -0,0 +1,100 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#if defined(_MSC_VER) && _MSC_VER < 1914 // error-path tests are switched off for MSVC older than 19.14
+#define STATIC_ASSERT_TESTING 0
+#else
+#define STATIC_ASSERT_TESTING 1
+#endif
+// Test the flag's value, not defined(): it is always defined above (as 0 or 1).
+#if STATIC_ASSERT_TESTING
+#include <stdio.h>
+#define NVTX3_STATIC_ASSERT(c, m) do { if (!(c)) printf("static_assert would fail: %s\n", m); } while (0)
+#endif
+
+#include <nvtx3/nvtx3.hpp>
+
+#include <iostream>
+
+// Domain description type, used below with domain::get<d>() and the *_in<d> templates
+struct d { static constexpr char const* name = "Test domain"; };
+
+// Registered string tag types (the error_* ones are malformed on purpose for the error tests)
+struct regstr_char_test      { static constexpr char const*    message = "Reg str char"; };
+struct regstr_wchar_test     { static constexpr wchar_t const* message = L"Reg str wchar_t"; };
+struct error_msg_missing     { static constexpr char const*    x       = "Name"; };
+struct error_msg_is_bad_type { static constexpr int            message = 5; };
+struct regstr_global_domain1 { static constexpr char const*    message = "Global1"; };
+struct regstr_global_domain2 { static constexpr char const*    message = "Global2"; };
+struct regstr_global_domain3 { static constexpr char const*    message = "Global3"; };
+
+// Test entry point, looked up by name by the RunTest driver.  Exercises registered-string lookup
+// in a named domain and in the global domain, then the error paths.  Always returns 0.
+extern "C" NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+
+    (void)argc;
+    (void)argv;
+
+    using namespace nvtx3;
+    (void)domain::get<d>();
+
+#if 1
+    std::cout << "- Registered string (char):\n";
+    (void)registered_string_in<d>::get<regstr_char_test>();
+    mark_in<d>("Mark in regstr_char_test category", registered_string_in<d>::get<regstr_char_test>());
+
+    std::cout << "- Registered string (wchar_t):\n";
+    (void)registered_string_in<d>::get<regstr_wchar_test>();
+    mark_in<d>("Mark in regstr_wchar_test category", registered_string_in<d>::get<regstr_wchar_test>());
+#endif
+
+#if 1
+    std::cout << "- Registered string in global domain (alias):\n";
+    (void)registered_string::get<regstr_global_domain1>();
+
+    std::cout << "- Registered string in global domain (implicit):\n";
+    (void)registered_string_in<>::get<regstr_global_domain2>();
+
+    std::cout << "- Registered string in global domain (explicit):\n";
+    (void)registered_string_in<domain::global>::get<regstr_global_domain3>();
+#endif
+
+    // Error tests: these tag types are malformed on purpose, so NVTX3_STATIC_ASSERT should report them
+#if STATIC_ASSERT_TESTING
+#if 1 // defined(ERROR_TEST_MSG_IS_MISSING)
+    {
+        std::cout << "- Error test - registered string is missing message member:\n";
+        (void)registered_string_in<d>::get<error_msg_missing>();
+    }
+#endif
+
+#if 1 // defined(ERROR_TEST_MSG_IS_BAD_TYPE)
+    {
+        std::cout << "- Error test - registered string message member isn't narrow or wide char array:\n";
+        (void)registered_string_in<d>::get<error_msg_is_bad_type>();
+    }
+#endif
+#endif // STATIC_ASSERT_TESTING
+
+    return 0;
+}
diff --git a/tests/RunTest.cpp b/tests/RunTest.cpp
new file mode 100644
index 0000000..01f39be
--- /dev/null
+++ b/tests/RunTest.cpp
@@ -0,0 +1,138 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include "PathHelper.h"
+#include "DllHelper.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+
+// Set environment variable 'name' to 'value', overwriting any old value; true on success.
+static bool SetEnvVar(const char* name, const char* value)
+{
+#if defined(_WIN32)
+    return _putenv_s(name, value) == 0;
+#else
+    return setenv(name, value, 1) == 0;
+#endif
+}
+
+// Parses leading "-t <test>" / "-i <injection>" options, exports the injection path through
+// the NVTX injection environment variable if requested, then loads the test library and calls
+// its RunTest export with the remaining arguments.  Returns 0 on success, a driver error
+// code (100-105), or whatever nonzero value the test itself returned.
+static int MainInternal(int argc, const char** argv)
+{
+    std::string testName;
+    std::string injectionName;
+
+    // Consume option/value pairs from the front; stop at the first unrecognized argument.
+    const char** const firstArg = argv;
+    for (++argv; *argv; ++argv)
+    {
+        const bool isTest      = strcmp(*argv, "-t") == 0;
+        const bool isInjection = !isTest && strcmp(*argv, "-i") == 0;
+        if (!isTest && !isInjection) break;
+
+        ++argv; // step onto the option's value
+        if (!*argv) return isTest ? 100 : 101;
+        if (isTest) testName      = *argv;
+        else        injectionName = *argv;
+    }
+    argc -= (int)(argv - firstArg); // argc now counts only the arguments left for the test
+
+    if (testName.empty()) return 103;
+
+    printf("RunTest:\n");
+
+    const std::string test = AbsolutePathToLibraryInCurrentProcessPath(testName);
+    printf("  - Using test:      %s\n", test.c_str());
+
+    std::string injection;
+    if (!injectionName.empty())
+    {
+        // Passing - for the injection means to use the test library as its own injection
+        if (injectionName == "-") injection = test;
+        else injection = AbsolutePathToLibraryInCurrentProcessPath(injectionName);
+
+        // The variable name depends on the pointer width of this process
+        const char* injectionVar = (sizeof(void*) == 8)
+            ? "NVTX_INJECTION64_PATH"
+            : "NVTX_INJECTION32_PATH";
+
+        if (!SetEnvVar(injectionVar, injection.c_str())) return 102;
+    }
+
+    printf("  - Using injection: %s\n", injection.empty() ? "<none>" : injection.c_str());
+
+    // Load the test library and look up its entry point
+    DLL_HANDLE hDll = DLL_OPEN(test.c_str());
+    if (!hDll) return 104;
+
+    typedef int (*pfnRunTest_t)(int, const char**);
+    pfnRunTest_t pfnRunTest = (pfnRunTest_t)GET_DLL_FUNC(hDll, "RunTest");
+    if (!pfnRunTest) return 105;
+
+    // Forward remaining args; the test's result (zero or not) is returned as-is
+    return pfnRunTest(argc, argv);
+}
+
+// Program entry point: runs MainInternal, prints a PASSED line or a description of the
+// failure, and returns MainInternal's result as the exit code.
+int main(int argc, const char** argv)
+{
+    const int result = MainInternal(argc, argv);
+    if (result == 0)
+    {
+        printf("RunTest PASSED\n");
+        return result;
+    }
+
+    // For error codes known to this test driver, print useful error descriptions.
+    // Otherwise, rely on test to print information about errors.
+    const char* description = nullptr;
+    switch (result)
+    {
+        case 100:
+            description = "RunTest: -t requires an argument, the base name of the library to use as a test"; break;
+        case 101:
+            description = "RunTest: -i requires an argument, the base name of the library to use as an injection"; break;
+        case 102:
+            description = "RunTest: Failed to set NVTX injection environment variable"; break;
+        case 103:
+            description = "RunTest: Missing required argument: -t <base name of library to use as a test>"; break;
+        case 104:
+            description = "RunTest: Test library failed to load"; break;
+        case 105:
+            description = "RunTest: Test library loaded, but does not export required entry point RunTest"; break;
+        default:
+            break;
+    }
+
+    if (description) puts(description);
+    else printf("RunTest FAILED with return code: %d\n", result);
+#ifndef _WIN32
+    // A load failure is followed by the dynamic loader's own explanation
+    if (result == 104) printf("    dlerror: %s\n", dlerror());
+#endif
+
+    return result;
+}
diff --git a/tests/Same.h b/tests/Same.h
new file mode 100644
index 0000000..5db2e7a
--- /dev/null
+++ b/tests/Same.h
@@ -0,0 +1,186 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include <type_traits>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <wchar.h>
+#include <string.h>
+
+//-----------------------------------------------------------------------------------------------
+// Implementations of "Same" function for various types
+//   Provides better comparison capabilities than operator==
+//   - Option for shallow or deep comparison (i.e. pointers vs. what they point at)
+//   - Option for verbose mode, with a custom ostream to write to
+//   - Option to specify name string for what is being compared
+//   - Option for indent depth, so nested comparisons can print unwinding mismatch messages
+//-----------------------------------------------------------------------------------------------
+
+// SFINAE utilities (C++11-compatible): void_t, an int-typed enable_if, and a completeness trait
+template <typename...> struct make_void { using type = void; };
+template <typename... Ts> using void_t = typename make_void<Ts...>::type;
+template <bool Condition> using enable_if = typename std::enable_if<Condition, int>::type;
+template <typename T, typename Enable = void> struct is_complete : std::false_type {};
+template <typename T> struct is_complete<T, void_t<decltype(sizeof(T))>> : std::true_type {};
+
+#define SAME_COMMON_ARGS \
+    bool deep = false, bool verbose = false, const char* name = "<unspecified>", std::ostream& oss = std::cout, int depth = 0
+
+// Generic comparison: two objects are the same when lhs == rhs.  Participates in overload
+// resolution only for types with both operator== and operator<< (see the trailing return
+// type).  'deep' has no effect here.  With 'verbose' set, a mismatch is written to 'oss',
+// indented by 'depth' spaces and labeled with 'name'.
+template <typename T>
+inline auto Same(T const& lhs, T const& rhs, SAME_COMMON_ARGS)
+    -> decltype(lhs == rhs, oss << lhs, bool())
+{
+    if (lhs == rhs) return true;
+
+    // Mismatch: describe it when asked to
+    if (verbose)
+    {
+        const std::string indent(depth, ' ');
+        oss << indent << "'" << name << "' different:  values are " << lhs << " and " << rhs << '\n';
+    }
+    return false;
+}
+
+// Generic pointer overload for complete types.  In deep mode with two non-null pointers the
+// pointed-to objects are compared; otherwise (shallow mode, or a null on either side, which
+// must not be dereferenced) the pointer values themselves are compared.
+template <typename T, enable_if<is_complete<T>::value> = 0>
+inline bool Same(T* lhs, T* rhs, SAME_COMMON_ARGS)
+{
+    if (deep && lhs && rhs)
+    {
+        return Same(*lhs, *rhs, deep, verbose, name, oss, depth);
+    }
+
+    bool ptrSame = lhs == rhs;
+    if (verbose && !ptrSame)
+    {
+        oss << std::string(depth, ' ') << "'" << name << "' different:  pointer values are 0x"
+            << static_cast<const void*>(lhs) << " and 0x" << static_cast<const void*>(rhs) << '\n';
+    }
+    return ptrSame;
+}
+
+// Pointer overload chosen when T is incomplete: the pointees cannot be deep-compared, so
+// 'deep' is ignored and the pointer values are always compared.
+template <typename T, enable_if<!is_complete<T>::value> = 0>
+inline bool Same(T* lhs, T* rhs, SAME_COMMON_ARGS)
+{
+    if (lhs == rhs) return true;
+    if (verbose)
+    {
+        oss << std::string(depth, ' ') << "'" << name << "' different:  pointer values (to incomplete type) are 0x"
+            << static_cast<const void*>(lhs) << " and 0x" << static_cast<const void*>(rhs) << '\n';
+    }
+    return false;
+}
+
+// Smart pointers: unwrap with get(); deep mode then compares pointees, non-deep the raw addresses.
+template <typename T>
+inline bool Same(std::shared_ptr<T> const& lhs, std::shared_ptr<T> const& rhs, SAME_COMMON_ARGS)
+{
+    auto const lhsRaw = lhs.get(), rhsRaw = rhs.get();
+    return Same(lhsRaw, rhsRaw, deep, verbose, name, oss, depth);
+}
+
+template <typename T>
+inline bool Same(std::unique_ptr<T> const& lhs, std::unique_ptr<T> const& rhs, SAME_COMMON_ARGS)
+{
+    auto const lhsRaw = lhs.get(), rhsRaw = rhs.get();
+    return Same(lhsRaw, rhsRaw, deep, verbose, name, oss, depth);
+}
+
+// Overloads for C-style strings (narrow and wide)
+// Deep mode compares contents with strcmp when both pointers are non-null; shallow mode, or
+// a null on either side (strcmp and operator<< must not see null), compares pointer values.
+inline bool Same(char const* lhs, char const* rhs, SAME_COMMON_ARGS)
+{
+    if (deep && lhs && rhs)
+    {
+        bool strSame = strcmp(lhs, rhs) == 0;
+        if (verbose && !strSame)
+        {
+            oss << std::string(depth, ' ') << "'" << name << "' different:  char strings are \""
+                << lhs << "\" and \"" << rhs << "\"\n";
+        }
+        return strSame;
+    }
+
+    bool ptrSame = lhs == rhs;
+    if (verbose && !ptrSame)
+    {
+        oss << std::string(depth, ' ') << "'" << name << "' different:  pointer values are "
+            << static_cast<const void*>(lhs) << " and " << static_cast<const void*>(rhs) << '\n';
+    }
+    return ptrSame;
+}
+
+// Wide-string counterpart: deep mode uses wcscmp when both pointers are non-null, otherwise
+// pointer values are compared.  The deep mismatch message prints "<TODO>" placeholders.
+inline bool Same(wchar_t const* lhs, wchar_t const* rhs, SAME_COMMON_ARGS)
+{
+    if (deep && lhs && rhs)
+    {
+        bool strSame = wcscmp(lhs, rhs) == 0;
+        if (verbose && !strSame)
+        {
+            oss << std::string(depth, ' ') << "'" << name << "' different:  wchar_t strings are L\""
+                << "<TODO>" << "\" and L\"" << "<TODO>" << "\"\n";
+        }
+        return strSame;
+    }
+
+    bool ptrSame = lhs == rhs;
+    if (verbose && !ptrSame)
+    {
+        oss << std::string(depth, ' ') << "'" << name << "' different:  pointer values are "
+            << static_cast<const void*>(lhs) << " and " << static_cast<const void*>(rhs) << '\n';
+    }
+    return ptrSame;
+}
+
+// Helper macros to define Same() overloads (and operators == and !=) for struct and tagged union types
+// MEMBER_SAME compares one member, labeled with its name, one level deeper; UNION_MEMBER_SAME also requires lhs's tag to equal tagValue
+#define MEMBER_SAME(member) Same(lhs.member, rhs.member, deep, verbose, #member, oss, depth + 1)
+#define UNION_MEMBER_SAME(tagField, tagValue, member) (lhs.tagField == tagValue && MEMBER_SAME(member))
+// Expects 'same', 'verbose', 'oss', 'depth' and 'name' in scope; reports that some member differed
+#define VERBOSE_PRINT() if (verbose && !same) oss << std::string(depth, ' ') << "'" << name << "' members different\n"
+// operator== signature, and operator!= defined as its negation
+#define EQ_SIG(T)     inline bool operator==(T const& lhs, T const& rhs)
+#define NE_FROM_EQ(T) inline bool operator!=(T const& lhs, T const& rhs) { return !(lhs == rhs); }
+// Operators built on Same(): the DEEP form passes deep = true, the SHALLOW form deep = false
+#define DEFINE_EQ_NE_DEEP(T)    EQ_SIG(T) { return Same(lhs, rhs, true ); } NE_FROM_EQ(T)
+#define DEFINE_EQ_NE_SHALLOW(T) EQ_SIG(T) { return Same(lhs, rhs, false); } NE_FROM_EQ(T)
+// Member comparisons joined with &&, so evaluation stops at the first mismatching member
+#define DEFINE_MEMBER_SAME_1(a)       MEMBER_SAME(a)
+#define DEFINE_MEMBER_SAME_2(a, b)    MEMBER_SAME(a) && DEFINE_MEMBER_SAME_1(b)
+#define DEFINE_MEMBER_SAME_3(a, b, c) MEMBER_SAME(a) && DEFINE_MEMBER_SAME_2(b, c)
+// Signature shared by the generated Same() overloads
+#define SAME_SIG(T) inline bool Same(T const& lhs, T const& rhs, SAME_COMMON_ARGS)
+// DEFINE_SAME_N(T, members...) generates Same(), operator== and operator!= (deep) for a struct T with N compared members
+#define DEFINE_SAME_0(T)          SAME_SIG(T) {                                                             return true; } DEFINE_EQ_NE_DEEP(T)
+#define DEFINE_SAME_1(T, a)       SAME_SIG(T) { bool same = DEFINE_MEMBER_SAME_1(a);       VERBOSE_PRINT(); return same; } DEFINE_EQ_NE_DEEP(T)
+#define DEFINE_SAME_2(T, a, b)    SAME_SIG(T) { bool same = DEFINE_MEMBER_SAME_2(a, b);    VERBOSE_PRINT(); return same; } DEFINE_EQ_NE_DEEP(T)
+#define DEFINE_SAME_3(T, a, b, c) SAME_SIG(T) { bool same = DEFINE_MEMBER_SAME_3(a, b, c); VERBOSE_PRINT(); return same; } DEFINE_EQ_NE_DEEP(T)
diff --git a/tests/SelfInjection.cpp b/tests/SelfInjection.cpp
new file mode 100644
index 0000000..29ddec2
--- /dev/null
+++ b/tests/SelfInjection.cpp
@@ -0,0 +1,199 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include "SelfInjection.h"
+#include <stdio.h>
+
+#if defined(NVTX_INJECTION_TEST_QUIET) // quiet builds compile error logging out entirely
+#define LOG_ERROR(...)
+#else
+#define LOG_ERROR(...) do { fprintf(stderr, "  [inj] ERROR: " __VA_ARGS__); } while (0) // printf-style; the format must be a string literal so it concatenates with the prefix
+#endif
+
+Callbacks g_callbacks; // callback set that every handler below forwards to
+
+namespace {
+// Each Handle* function takes the arguments of one NVTX call and forwards them to the same-named g_callbacks member.
+/* NVTX_CB_MODULE_CORE */
+void          NVTX_API HandleMarkEx       (const nvtxEventAttributes_t* eventAttrib) {        g_callbacks.MarkEx       (eventAttrib); }
+void          NVTX_API HandleMarkA        (const char* str                         ) {        g_callbacks.MarkA        (str        ); }
+void          NVTX_API HandleMarkW        (const wchar_t* str                      ) {        g_callbacks.MarkW        (str        ); }
+nvtxRangeId_t NVTX_API HandleRangeStartEx (const nvtxEventAttributes_t* eventAttrib) { return g_callbacks.RangeStartEx (eventAttrib); }
+nvtxRangeId_t NVTX_API HandleRangeStartA  (const char* str                         ) { return g_callbacks.RangeStartA  (str        ); }
+nvtxRangeId_t NVTX_API HandleRangeStartW  (const wchar_t* str                      ) { return g_callbacks.RangeStartW  (str        ); }
+void          NVTX_API HandleRangeEnd     (nvtxRangeId_t id                        ) {        g_callbacks.RangeEnd     (id         ); }
+int           NVTX_API HandleRangePushEx  (const nvtxEventAttributes_t* eventAttrib) { return g_callbacks.RangePushEx  (eventAttrib); }
+int           NVTX_API HandleRangePushA   (const char* str                         ) { return g_callbacks.RangePushA   (str        ); }
+int           NVTX_API HandleRangePushW   (const wchar_t* str                      ) { return g_callbacks.RangePushW   (str        ); }
+int           NVTX_API HandleRangePop     (                                        ) { return g_callbacks.RangePop     (           ); }
+void          NVTX_API HandleNameCategoryA(uint32_t id, const char* str            ) {        g_callbacks.NameCategoryA(id, str    ); }
+void          NVTX_API HandleNameCategoryW(uint32_t id, const wchar_t* str         ) {        g_callbacks.NameCategoryW(id, str    ); }
+void          NVTX_API HandleNameOsThreadA(uint32_t id, const char* str            ) {        g_callbacks.NameOsThreadA(id, str    ); }
+void          NVTX_API HandleNameOsThreadW(uint32_t id, const wchar_t* str         ) {        g_callbacks.NameOsThreadW(id, str    ); }
+
+/* NVTX_CB_MODULE_CORE2 */
+void                 NVTX_API HandleDomainMarkEx         (nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib) {        g_callbacks.DomainMarkEx         (domain, eventAttrib); }
+nvtxRangeId_t        NVTX_API HandleDomainRangeStartEx   (nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib) { return g_callbacks.DomainRangeStartEx   (domain, eventAttrib); }
+void                 NVTX_API HandleDomainRangeEnd       (nvtxDomainHandle_t domain, nvtxRangeId_t id                        ) {        g_callbacks.DomainRangeEnd       (domain, id         ); }
+int                  NVTX_API HandleDomainRangePushEx    (nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib) { return g_callbacks.DomainRangePushEx    (domain, eventAttrib); }
+int                  NVTX_API HandleDomainRangePop       (nvtxDomainHandle_t domain                                          ) { return g_callbacks.DomainRangePop       (domain             ); }
+nvtxResourceHandle_t NVTX_API HandleDomainResourceCreate (nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attr          ) { return g_callbacks.DomainResourceCreate (domain, attr       ); }
+void                 NVTX_API HandleDomainResourceDestroy(nvtxResourceHandle_t attr                                          ) {        g_callbacks.DomainResourceDestroy(attr               ); }
+void                 NVTX_API HandleDomainNameCategoryA  (nvtxDomainHandle_t domain, uint32_t id, const char* str            ) {        g_callbacks.DomainNameCategoryA  (domain, id, str    ); }
+void                 NVTX_API HandleDomainNameCategoryW  (nvtxDomainHandle_t domain, uint32_t id, const wchar_t* str         ) {        g_callbacks.DomainNameCategoryW  (domain, id, str    ); }
+nvtxStringHandle_t   NVTX_API HandleDomainRegisterStringA(nvtxDomainHandle_t domain, const char* str                         ) { return g_callbacks.DomainRegisterStringA(domain, str        ); }
+nvtxStringHandle_t   NVTX_API HandleDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* str                      ) { return g_callbacks.DomainRegisterStringW(domain, str        ); }
+nvtxDomainHandle_t   NVTX_API HandleDomainCreateA        (const char* name                                                   ) { return g_callbacks.DomainCreateA        (name               ); }
+nvtxDomainHandle_t   NVTX_API HandleDomainCreateW        (const wchar_t* name                                                ) { return g_callbacks.DomainCreateW        (name               ); }
+void                 NVTX_API HandleDomainDestroy        (nvtxDomainHandle_t domain                                          ) {        g_callbacks.DomainDestroy        (domain             ); }
+void                 NVTX_API HandleInitialize           (const void* reserved                                               ) {        g_callbacks.Initialize           (reserved           ); }
+
+} // anonymous namespace
+
+/* NVTX injection entry point.  Validates the client's export tables, then fills the CORE and
+ * CORE2 callback tables with the Handle* forwarders.  Returns 1 on success and 0 on failure;
+ * every exit path also reports that outcome through g_callbacks.Load(). */
+extern "C" NVTX_DYNAMIC_EXPORT
+int NVTX_API InitializeInjectionNvtx2(NvtxGetExportTableFunc_t getExportTable)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+    uint32_t version = 0;
+    const NvtxExportTableVersionInfo* pVersionInfo =
+        (const NvtxExportTableVersionInfo*)getExportTable(NVTX_ETID_VERSIONINFO);
+    if (pVersionInfo) /* version info is validated only when the client provides it */
+    {
+        if (pVersionInfo->struct_size < sizeof(*pVersionInfo))
+        {
+            LOG_ERROR(
+                "(init v2) NvtxExportTableVersionInfo structure size is %d, expected %d!\n",
+                (int)pVersionInfo->struct_size,
+                (int)sizeof(*pVersionInfo));
+            g_callbacks.Load(0);
+            return 0;
+        }
+
+        version = pVersionInfo->version;
+        if (version < 2)
+        {
+            LOG_ERROR(
+                "(init v2) client's NVTX version is %d, expected 2+\n",
+                (int)version);
+            g_callbacks.Load(0);
+            return 0;
+        }
+    }
+
+    const NvtxExportTableCallbacks* pCallbacks =
+        (const NvtxExportTableCallbacks*)getExportTable(NVTX_ETID_CALLBACKS);
+    if (!pCallbacks)
+    {
+        LOG_ERROR("(init v2) NVTX_ETID_CALLBACKS is not supported.\n");
+        g_callbacks.Load(0);
+        return 0;
+    }
+
+    if (pCallbacks->struct_size < sizeof(*pCallbacks))
+    {
+        LOG_ERROR("(init v2) NvtxExportTableCallbacks structure size is %d, expected %d!\n",
+            (int)pCallbacks->struct_size,
+            (int)sizeof(*pCallbacks));
+        g_callbacks.Load(0);
+        return 0;
+    }
+
+    {
+        NvtxFunctionTable table = 0;
+        unsigned int size = 0;
+        int success = pCallbacks->GetModuleFunctionTable(NVTX_CB_MODULE_CORE, &table, &size);
+        if (!success || !table)
+        {
+            LOG_ERROR("(init v2) NVTX_CB_MODULE_CORE is not supported.\n");
+            g_callbacks.Load(0);
+            return 0;
+        }
+        /* Ensure client's table is new enough to support the function pointers we want to register */
+        unsigned int highestIdUsed = NVTX_CBID_CORE_NameOsThreadW; /* Last CORE id assigned below; the check must cover it */
+        if (size <= highestIdUsed)
+        {
+            LOG_ERROR("(init v2) Client's function pointer table size is %d, and we need to assign to table[%d].\n",
+                (int)size,
+                (int)highestIdUsed);
+            g_callbacks.Load(0);
+            return 0;
+        }
+
+        *table[NVTX_CBID_CORE_MarkEx       ] = (NvtxFunctionPointer)HandleMarkEx       ;
+        *table[NVTX_CBID_CORE_MarkA        ] = (NvtxFunctionPointer)HandleMarkA        ;
+        *table[NVTX_CBID_CORE_MarkW        ] = (NvtxFunctionPointer)HandleMarkW        ;
+        *table[NVTX_CBID_CORE_RangeStartEx ] = (NvtxFunctionPointer)HandleRangeStartEx ;
+        *table[NVTX_CBID_CORE_RangeStartA  ] = (NvtxFunctionPointer)HandleRangeStartA  ;
+        *table[NVTX_CBID_CORE_RangeStartW  ] = (NvtxFunctionPointer)HandleRangeStartW  ;
+        *table[NVTX_CBID_CORE_RangeEnd     ] = (NvtxFunctionPointer)HandleRangeEnd     ;
+        *table[NVTX_CBID_CORE_RangePushEx  ] = (NvtxFunctionPointer)HandleRangePushEx  ;
+        *table[NVTX_CBID_CORE_RangePushA   ] = (NvtxFunctionPointer)HandleRangePushA   ;
+        *table[NVTX_CBID_CORE_RangePushW   ] = (NvtxFunctionPointer)HandleRangePushW   ;
+        *table[NVTX_CBID_CORE_RangePop     ] = (NvtxFunctionPointer)HandleRangePop     ;
+        *table[NVTX_CBID_CORE_NameCategoryA] = (NvtxFunctionPointer)HandleNameCategoryA;
+        *table[NVTX_CBID_CORE_NameCategoryW] = (NvtxFunctionPointer)HandleNameCategoryW;
+        *table[NVTX_CBID_CORE_NameOsThreadA] = (NvtxFunctionPointer)HandleNameOsThreadA;
+        *table[NVTX_CBID_CORE_NameOsThreadW] = (NvtxFunctionPointer)HandleNameOsThreadW;
+    }
+
+    {
+        NvtxFunctionTable table = 0;
+        unsigned int size = 0;
+        int success = pCallbacks->GetModuleFunctionTable(NVTX_CB_MODULE_CORE2, &table, &size);
+        if (!success || !table)
+        {
+            LOG_ERROR("(init v2) NVTX_CB_MODULE_CORE2 is not supported.\n");
+            g_callbacks.Load(0);
+            return 0;
+        }
+        /* Ensure client's table is new enough to support the function pointers we want to register */
+        unsigned int highestIdUsed = NVTX_CBID_CORE2_Initialize; /* Last CORE2 id assigned below */
+        if (size <= highestIdUsed)
+        {
+            LOG_ERROR("(init v2) Client's function pointer table size is %d, and we need to assign to table[%d].\n",
+                (int)size,
+                (int)highestIdUsed);
+            g_callbacks.Load(0);
+            return 0;
+        }
+
+        *table[NVTX_CBID_CORE2_DomainMarkEx         ] = (NvtxFunctionPointer)HandleDomainMarkEx         ;
+        *table[NVTX_CBID_CORE2_DomainRangeStartEx   ] = (NvtxFunctionPointer)HandleDomainRangeStartEx   ;
+        *table[NVTX_CBID_CORE2_DomainRangeEnd       ] = (NvtxFunctionPointer)HandleDomainRangeEnd       ;
+        *table[NVTX_CBID_CORE2_DomainRangePushEx    ] = (NvtxFunctionPointer)HandleDomainRangePushEx    ;
+        *table[NVTX_CBID_CORE2_DomainRangePop       ] = (NvtxFunctionPointer)HandleDomainRangePop       ;
+        *table[NVTX_CBID_CORE2_DomainResourceCreate ] = (NvtxFunctionPointer)HandleDomainResourceCreate ;
+        *table[NVTX_CBID_CORE2_DomainResourceDestroy] = (NvtxFunctionPointer)HandleDomainResourceDestroy;
+        *table[NVTX_CBID_CORE2_DomainNameCategoryA  ] = (NvtxFunctionPointer)HandleDomainNameCategoryA  ;
+        *table[NVTX_CBID_CORE2_DomainNameCategoryW  ] = (NvtxFunctionPointer)HandleDomainNameCategoryW  ;
+        *table[NVTX_CBID_CORE2_DomainRegisterStringA] = (NvtxFunctionPointer)HandleDomainRegisterStringA;
+        *table[NVTX_CBID_CORE2_DomainRegisterStringW] = (NvtxFunctionPointer)HandleDomainRegisterStringW;
+        *table[NVTX_CBID_CORE2_DomainCreateA        ] = (NvtxFunctionPointer)HandleDomainCreateA        ;
+        *table[NVTX_CBID_CORE2_DomainCreateW        ] = (NvtxFunctionPointer)HandleDomainCreateW        ;
+        *table[NVTX_CBID_CORE2_DomainDestroy        ] = (NvtxFunctionPointer)HandleDomainDestroy        ;
+        *table[NVTX_CBID_CORE2_Initialize           ] = (NvtxFunctionPointer)HandleInitialize           ;
+    }
+
+    g_callbacks.Load(1);
+    return 1;
+}
diff --git a/tests/SelfInjection.h b/tests/SelfInjection.h
new file mode 100644
index 0000000..07a3461
--- /dev/null
+++ b/tests/SelfInjection.h
@@ -0,0 +1,688 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#define NVTX_NO_IMPL
+#include "nvtx3/nvToolsExt.h"
+
+#include "Same.h"
+#include "PrettyPrintersNvtxC.h"
+
+#include <functional>
+#include <iostream>
+#include <iomanip>
+#include <memory>
+#include <map>
+#include <string>
+
+// Sentinel values: -1 converted to each NVTX id/handle type.
+// NOTE(review): going by the names, these stand for ids/handles that are unused when a tool is
+// attached -- confirm against the code that returns or compares them.
+// The three handle sentinels are built with reinterpret_cast, which cannot appear in a constant
+// expression, so they are `const` rather than `constexpr`.
+constexpr auto NVTX_TOOL_ATTACHED_UNUSED_RANGE_ID = static_cast<nvtxRangeId_t>(-1LL);
+constexpr int  NVTX_TOOL_ATTACHED_UNUSED_PUSH_POP_ID = -1;
+const     auto NVTX_TOOL_ATTACHED_UNUSED_DOMAIN_HANDLE = reinterpret_cast<nvtxDomainHandle_t>(-1LL);
+const     auto NVTX_TOOL_ATTACHED_UNUSED_STRING_HANDLE = reinterpret_cast<nvtxStringHandle_t>(-1LL);
+const     auto NVTX_TOOL_ATTACHED_UNUSED_RESOURCE_HANDLE = reinterpret_cast<nvtxResourceHandle_t>(-1LL);
+
+// Argument packs: one plain struct per recorded call, holding that call's arguments.
+// Pointer members are stored as-is here; the DeepCopyAssign/DeepCopyDestroy helpers further down
+// are what give a stored pack its own copies of pointed-to strings and attribute structs.
+
+// Result of the injection-load pseudo-call (printed as "... returned <success>").
+struct ArgsLoad { int success; };
+
+// Packs for the NVTX_CB_MODULE_CORE callbacks.
+struct ArgsMarkEx        { const nvtxEventAttributes_t* eventAttrib; };
+struct ArgsMarkA         { const char* str                         ; };
+struct ArgsMarkW         { const wchar_t* str                      ; };
+struct ArgsRangeStartEx  { const nvtxEventAttributes_t* eventAttrib; };
+struct ArgsRangeStartA   { const char* str                         ; };
+struct ArgsRangeStartW   { const wchar_t* str                      ; };
+struct ArgsRangeEnd      { nvtxRangeId_t id                        ; };
+struct ArgsRangePushEx   { const nvtxEventAttributes_t* eventAttrib; };
+struct ArgsRangePushA    { const char* str                         ; };
+struct ArgsRangePushW    { const wchar_t* str                      ; };
+struct ArgsRangePop      {                                         ; };
+struct ArgsNameCategoryA { uint32_t id; const char* str            ; };
+struct ArgsNameCategoryW { uint32_t id; const wchar_t* str         ; };
+struct ArgsNameOsThreadA { uint32_t id; const char* str            ; };
+struct ArgsNameOsThreadW { uint32_t id; const wchar_t* str         ; };
+
+// Packs for the NVTX_CB_MODULE_CORE2 callbacks.
+// Note that ArgsDomainResourceDestroy::attr is a resource *handle*, unlike
+// ArgsDomainResourceCreate::attr, which points to the attribute struct.
+struct ArgsDomainMarkEx          { nvtxDomainHandle_t domain; const nvtxEventAttributes_t* eventAttrib; };
+struct ArgsDomainRangeStartEx    { nvtxDomainHandle_t domain; const nvtxEventAttributes_t* eventAttrib; };
+struct ArgsDomainRangeEnd        { nvtxDomainHandle_t domain; nvtxRangeId_t id                        ; };
+struct ArgsDomainRangePushEx     { nvtxDomainHandle_t domain; const nvtxEventAttributes_t* eventAttrib; };
+struct ArgsDomainRangePop        { nvtxDomainHandle_t domain                                          ; };
+struct ArgsDomainResourceCreate  { nvtxDomainHandle_t domain; nvtxResourceAttributes_t* attr          ; };
+struct ArgsDomainResourceDestroy { nvtxResourceHandle_t attr                                          ; };
+struct ArgsDomainNameCategoryA   { nvtxDomainHandle_t domain; uint32_t id; const char* str            ; };
+struct ArgsDomainNameCategoryW   { nvtxDomainHandle_t domain; uint32_t id; const wchar_t* str         ; };
+struct ArgsDomainRegisterStringA { nvtxDomainHandle_t domain; const char* str                         ; };
+struct ArgsDomainRegisterStringW { nvtxDomainHandle_t domain; const wchar_t* str                      ; };
+struct ArgsDomainCreateA         { const char* name                                                   ; };
+struct ArgsDomainCreateW         { const wchar_t* name                                                ; };
+struct ArgsDomainDestroy         { nvtxDomainHandle_t domain                                          ; };
+struct ArgsInitialize            { const void* reserved                                               ; };
+
+// Identifies which call was recorded: the callback module plus the callback id within that module.
+struct CallId
+{
+    NvtxCallbackModule mod;
+    int32_t cb;
+};
+// Generates the comparison support for CallId over both members (macro from Same.h).
+// NOTE(review): presumably this also provides the operator== used as `id == CALLID_LOAD()`
+// below -- confirm in Same.h.
+DEFINE_SAME_2(CallId, mod, cb)
+
+// Helper to write CALLID(CORE, MarkEx) as shorthand for CallId{NVTX_CB_MODULE_CORE, NVTX_CBID_CORE_MarkEx}
+// (the callback id is cast to int32_t to match CallId::cb).
+#define CALLID(m,c) CallId{NVTX_CB_MODULE_##m, (int32_t)NVTX_CBID_##m##_##c}
+
+// Id of the pseudo-call that records the injection load; CallName() prints it as
+// "InitializeInjectionNvtx2". It pairs NVTX_CB_MODULE_INVALID with a fixed marker value.
+// NOTE(review): the marker is presumably arbitrary and only needs to be distinct -- confirm.
+#define CALLID_LOAD() CallId{NVTX_CB_MODULE_INVALID, (int32_t)0x7ac0be11}
+
+inline const char* CallName(CallId const& id)
+{
+    if (id == CALLID_LOAD()) return "InitializeInjectionNvtx2";
+    switch (id.mod)
+    {
+    case NVTX_CB_MODULE_CORE:
+        switch (id.cb)
+        {
+        case NVTX_CBID_CORE_MarkEx       : return "MarkEx";
+        case NVTX_CBID_CORE_MarkA        : return "MarkA";
+        case NVTX_CBID_CORE_MarkW        : return "MarkW";
+        case NVTX_CBID_CORE_RangeStartEx : return "RangeStartEx";
+        case NVTX_CBID_CORE_RangeStartA  : return "RangeStartA";
+        case NVTX_CBID_CORE_RangeStartW  : return "RangeStartW";
+        case NVTX_CBID_CORE_RangeEnd     : return "RangeEnd";
+        case NVTX_CBID_CORE_RangePushEx  : return "RangePushEx";
+        case NVTX_CBID_CORE_RangePushA   : return "RangePushA";
+        case NVTX_CBID_CORE_RangePushW   : return "RangePushW";
+        case NVTX_CBID_CORE_RangePop     : return "RangePop";
+        case NVTX_CBID_CORE_NameCategoryA: return "NameCategoryA";
+        case NVTX_CBID_CORE_NameCategoryW: return "NameCategoryW";
+        case NVTX_CBID_CORE_NameOsThreadA: return "NameOsThreadA";
+        case NVTX_CBID_CORE_NameOsThreadW: return "NameOsThreadW";
+        default: return "<Unknown CORE call>";
+        }
+    case NVTX_CB_MODULE_CORE2:
+        switch (id.cb)
+        {
+        case NVTX_CBID_CORE2_DomainMarkEx         : return "DomainMarkEx";
+        case NVTX_CBID_CORE2_DomainRangeStartEx   : return "DomainRangeStartEx";
+        case NVTX_CBID_CORE2_DomainRangeEnd       : return "DomainRangeEnd";
+        case NVTX_CBID_CORE2_DomainRangePushEx    : return "DomainRangePushEx";
+        case NVTX_CBID_CORE2_DomainRangePop       : return "DomainRangePop";
+        case NVTX_CBID_CORE2_DomainResourceCreate : return "DomainResourceCreate";
+        case NVTX_CBID_CORE2_DomainResourceDestroy: return "DomainResourceDestroy";
+        case NVTX_CBID_CORE2_DomainNameCategoryA  : return "DomainNameCategoryA";
+        case NVTX_CBID_CORE2_DomainNameCategoryW  : return "DomainNameCategoryW";
+        case NVTX_CBID_CORE2_DomainRegisterStringA: return "DomainRegisterStringA";
+        case NVTX_CBID_CORE2_DomainRegisterStringW: return "DomainRegisterStringW";
+        case NVTX_CBID_CORE2_DomainCreateA        : return "DomainCreateA";
+        case NVTX_CBID_CORE2_DomainCreateW        : return "DomainCreateW";
+        case NVTX_CBID_CORE2_DomainDestroy        : return "DomainDestroy";
+        case NVTX_CBID_CORE2_Initialize           : return "Initialize";
+        default: return "<Unknown CORE2 call>";
+        }
+    default: return "<Unknown CB_MODULE>";
+    }
+}
+
+// Streams only the call's name (see CallName); the module/callback numbers are not printed.
+inline std::ostream& operator<<(std::ostream& os, CallId const& id)
+{
+    return os << CallName(id);
+}
+
+// Storage for the argument pack of any one recorded call.
+// The union itself does not record which member is active; CallData pairs it with a CallId
+// and switches on that id to pick the member.
+union Args
+{
+    // Injection-load pseudo-call
+    ArgsLoad          Load;
+
+    // NVTX_CB_MODULE_CORE
+    ArgsMarkEx        MarkEx       ;
+    ArgsMarkA         MarkA        ;
+    ArgsMarkW         MarkW        ;
+    ArgsRangeStartEx  RangeStartEx ;
+    ArgsRangeStartA   RangeStartA  ;
+    ArgsRangeStartW   RangeStartW  ;
+    ArgsRangeEnd      RangeEnd     ;
+    ArgsRangePushEx   RangePushEx  ;
+    ArgsRangePushA    RangePushA   ;
+    ArgsRangePushW    RangePushW   ;
+    ArgsRangePop      RangePop     ;
+    ArgsNameCategoryA NameCategoryA;
+    ArgsNameCategoryW NameCategoryW;
+    ArgsNameOsThreadA NameOsThreadA;
+    ArgsNameOsThreadW NameOsThreadW;
+
+    // NVTX_CB_MODULE_CORE2
+    ArgsDomainMarkEx          DomainMarkEx         ;
+    ArgsDomainRangeStartEx    DomainRangeStartEx   ;
+    ArgsDomainRangeEnd        DomainRangeEnd       ;
+    ArgsDomainRangePushEx     DomainRangePushEx    ;
+    ArgsDomainRangePop        DomainRangePop       ;
+    ArgsDomainResourceCreate  DomainResourceCreate ;
+    ArgsDomainResourceDestroy DomainResourceDestroy;
+    ArgsDomainNameCategoryA   DomainNameCategoryA  ;
+    ArgsDomainNameCategoryW   DomainNameCategoryW  ;
+    ArgsDomainRegisterStringA DomainRegisterStringA;
+    ArgsDomainRegisterStringW DomainRegisterStringW;
+    ArgsDomainCreateA         DomainCreateA        ;
+    ArgsDomainCreateW         DomainCreateW        ;
+    ArgsDomainDestroy         DomainDestroy        ;
+    ArgsInitialize            Initialize           ;
+};
+
+// Free functions to emulate copy constructors and destructors for the NVTX C API types using pointers
+
+// Deep-copies the NUL-terminated string `rhs` into a new[]-allocated buffer and stores the
+// buffer in `lhs`. A null `rhs` yields a null `lhs` (it must not reach strlen).
+// The result is released with DestroyCstring.
+inline void CopyCstring(const char*& lhs, const char* rhs)
+{
+    if (rhs == nullptr)
+    {
+        lhs = nullptr;
+        return;
+    }
+    size_t len = strlen(rhs) + 1; // +1 so the terminator is copied too
+    auto* tmp = new char[len];
+    std::copy(rhs, rhs + len, tmp);
+    lhs = tmp;
+}
+// In-place form: replaces `s` with a private copy of the string it currently points to.
+inline void CopyCstring(const char*& s) { CopyCstring(s, s); }
+// Releases a string produced by CopyCstring; null is fine (delete[] of null is a no-op).
+inline void DestroyCstring(const char* s) { delete[] s; }
+
+// Wide-character counterpart of CopyCstring: deep-copies the NUL-terminated wide string `rhs`
+// into a new[]-allocated buffer stored in `lhs`. A null `rhs` yields a null `lhs` (it must not
+// reach wcslen). The result is released with DestroyCstring.
+inline void CopyCstring(const wchar_t*& lhs, const wchar_t* rhs)
+{
+    if (rhs == nullptr)
+    {
+        lhs = nullptr;
+        return;
+    }
+    size_t len = wcslen(rhs) + 1; // +1 so the terminator is copied too
+    auto* tmp = new wchar_t[len];
+    std::copy(rhs, rhs + len, tmp);
+    lhs = tmp;
+}
+// In-place form: replaces `s` with a private copy of the wide string it currently points to.
+inline void CopyCstring(const wchar_t*& s) { CopyCstring(s, s); }
+// Releases a wide string produced by CopyCstring; null is fine (delete[] of null is a no-op).
+inline void DestroyCstring(const wchar_t* s) { delete[] s; }
+
+// Deep-copies `*rhs` into a heap-allocated nvtxEventAttributes_t and stores it in `lhs`.
+// The struct is copied bytewise (sizeof(nvtxEventAttributes_t) bytes, independent of rhs->size);
+// an ASCII or UNICODE message string is then duplicated so the copy owns it. Any other message
+// type (e.g. a registered-string handle) keeps the bytewise-copied value.
+// A null `rhs` yields a null `lhs`. The result is released with DestroyEventAttributes.
+inline void CopyEventAttributes(const nvtxEventAttributes_t*& lhs, const nvtxEventAttributes_t* rhs)
+{
+    if (rhs == nullptr)
+    {
+        lhs = nullptr;
+        return;
+    }
+    auto* tmp = new nvtxEventAttributes_t;
+    memcpy(tmp, rhs, sizeof(*tmp));
+    switch (tmp->messageType)
+    {
+    // A null message pointer is left null rather than handed to the string copier.
+    case NVTX_MESSAGE_TYPE_ASCII:   if (tmp->message.ascii)   CopyCstring(tmp->message.ascii);   break;
+    case NVTX_MESSAGE_TYPE_UNICODE: if (tmp->message.unicode) CopyCstring(tmp->message.unicode); break;
+    default: break;
+    }
+    lhs = tmp;
+}
+// In-place form: replaces `a` with a private deep copy of the struct it currently points to.
+inline void CopyEventAttributes(const nvtxEventAttributes_t*& a) { CopyEventAttributes(a, a); }
+// Releases a struct produced by CopyEventAttributes, including its owned message string.
+// Null is accepted and ignored.
+inline void DestroyEventAttributes(const nvtxEventAttributes_t* a)
+{
+    if (a == nullptr) return;
+    switch (a->messageType)
+    {
+    case NVTX_MESSAGE_TYPE_ASCII:   DestroyCstring(a->message.ascii);   break;
+    case NVTX_MESSAGE_TYPE_UNICODE: DestroyCstring(a->message.unicode); break;
+    default: break;
+    }
+    delete a;
+}
+
+// Deep-copies `*rhs` into a heap-allocated nvtxResourceAttributes_t and stores it in `lhs`.
+// The struct is copied bytewise (sizeof(nvtxResourceAttributes_t) bytes, independent of
+// rhs->size); an ASCII or UNICODE message string is then duplicated so the copy owns it. Any
+// other message type keeps the bytewise-copied value.
+// A null `rhs` yields a null `lhs`. The result is released with DestroyResourceAttributes.
+inline void CopyResourceAttributes(nvtxResourceAttributes_t*& lhs, const nvtxResourceAttributes_t* rhs)
+{
+    if (rhs == nullptr)
+    {
+        lhs = nullptr;
+        return;
+    }
+    auto* tmp = new nvtxResourceAttributes_t;
+    memcpy(tmp, rhs, sizeof(*tmp));
+    switch (tmp->messageType)
+    {
+    // A null message pointer is left null rather than handed to the string copier.
+    case NVTX_MESSAGE_TYPE_ASCII:   if (tmp->message.ascii)   CopyCstring(tmp->message.ascii);   break;
+    case NVTX_MESSAGE_TYPE_UNICODE: if (tmp->message.unicode) CopyCstring(tmp->message.unicode); break;
+    default: break;
+    }
+    lhs = tmp;
+}
+// In-place form: replaces `a` with a private deep copy of the struct it currently points to.
+inline void CopyResourceAttributes(nvtxResourceAttributes_t*& a) { CopyResourceAttributes(a, a); }
+// Releases a struct produced by CopyResourceAttributes, including its owned message string.
+// Null is accepted and ignored.
+inline void DestroyResourceAttributes(nvtxResourceAttributes_t* a)
+{
+    if (a == nullptr) return;
+    switch (a->messageType)
+    {
+    case NVTX_MESSAGE_TYPE_ASCII:   DestroyCstring(a->message.ascii);   break;
+    case NVTX_MESSAGE_TYPE_UNICODE: DestroyCstring(a->message.unicode); break;
+    default: break;
+    }
+    delete a;
+}
+
+// DeepCopyAssign(dst, src): memberwise-assign an argument pack, then give `dst` its own copy of
+// anything the pack points to (event/resource attribute structs, narrow/wide strings), so that
+// `dst` stays valid after the caller's originals are gone. DeepCopyDestroy is the counterpart.
+
+// Fallback for packs without owned pointers: plain assignment.
+template <typename ArgsT> inline void DeepCopyAssign(ArgsT& lhs, ArgsT const& rhs) { lhs = rhs; }
+
+// NVTX_CB_MODULE_CORE packs
+template <> inline void DeepCopyAssign(ArgsMarkEx       & lhs, ArgsMarkEx        const& rhs) { lhs = rhs; CopyEventAttributes(lhs.eventAttrib, rhs.eventAttrib); }
+template <> inline void DeepCopyAssign(ArgsMarkA        & lhs, ArgsMarkA         const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsMarkW        & lhs, ArgsMarkW         const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsRangeStartEx & lhs, ArgsRangeStartEx  const& rhs) { lhs = rhs; CopyEventAttributes(lhs.eventAttrib, rhs.eventAttrib); }
+template <> inline void DeepCopyAssign(ArgsRangeStartA  & lhs, ArgsRangeStartA   const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsRangeStartW  & lhs, ArgsRangeStartW   const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsRangeEnd     & lhs, ArgsRangeEnd      const& rhs) { lhs = rhs; }
+template <> inline void DeepCopyAssign(ArgsRangePushEx  & lhs, ArgsRangePushEx   const& rhs) { lhs = rhs; CopyEventAttributes(lhs.eventAttrib, rhs.eventAttrib); }
+template <> inline void DeepCopyAssign(ArgsRangePushA   & lhs, ArgsRangePushA    const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsRangePushW   & lhs, ArgsRangePushW    const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsRangePop     & lhs, ArgsRangePop      const& rhs) { lhs = rhs; }
+template <> inline void DeepCopyAssign(ArgsNameCategoryA& lhs, ArgsNameCategoryA const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsNameCategoryW& lhs, ArgsNameCategoryW const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsNameOsThreadA& lhs, ArgsNameOsThreadA const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsNameOsThreadW& lhs, ArgsNameOsThreadW const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+
+// NVTX_CB_MODULE_CORE2 packs (handles are copied by value; only attributes/strings are duplicated)
+template <> inline void DeepCopyAssign(ArgsDomainMarkEx         & lhs, ArgsDomainMarkEx          const& rhs) { lhs = rhs; CopyEventAttributes(lhs.eventAttrib, rhs.eventAttrib); }
+template <> inline void DeepCopyAssign(ArgsDomainRangeStartEx   & lhs, ArgsDomainRangeStartEx    const& rhs) { lhs = rhs; CopyEventAttributes(lhs.eventAttrib, rhs.eventAttrib); }
+template <> inline void DeepCopyAssign(ArgsDomainRangeEnd       & lhs, ArgsDomainRangeEnd        const& rhs) { lhs = rhs; }
+template <> inline void DeepCopyAssign(ArgsDomainRangePushEx    & lhs, ArgsDomainRangePushEx     const& rhs) { lhs = rhs; CopyEventAttributes(lhs.eventAttrib, rhs.eventAttrib); }
+template <> inline void DeepCopyAssign(ArgsDomainRangePop       & lhs, ArgsDomainRangePop        const& rhs) { lhs = rhs; }
+template <> inline void DeepCopyAssign(ArgsDomainResourceCreate & lhs, ArgsDomainResourceCreate  const& rhs) { lhs = rhs; CopyResourceAttributes(lhs.attr, rhs.attr); }
+template <> inline void DeepCopyAssign(ArgsDomainResourceDestroy& lhs, ArgsDomainResourceDestroy const& rhs) { lhs = rhs; }
+template <> inline void DeepCopyAssign(ArgsDomainNameCategoryA  & lhs, ArgsDomainNameCategoryA   const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsDomainNameCategoryW  & lhs, ArgsDomainNameCategoryW   const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsDomainRegisterStringA& lhs, ArgsDomainRegisterStringA const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsDomainRegisterStringW& lhs, ArgsDomainRegisterStringW const& rhs) { lhs = rhs; CopyCstring(lhs.str, rhs.str); }
+template <> inline void DeepCopyAssign(ArgsDomainCreateA        & lhs, ArgsDomainCreateA         const& rhs) { lhs = rhs; CopyCstring(lhs.name, rhs.name); }
+template <> inline void DeepCopyAssign(ArgsDomainCreateW        & lhs, ArgsDomainCreateW         const& rhs) { lhs = rhs; CopyCstring(lhs.name, rhs.name); }
+template <> inline void DeepCopyAssign(ArgsDomainDestroy        & lhs, ArgsDomainDestroy         const& rhs) { lhs = rhs; }
+template <> inline void DeepCopyAssign(ArgsInitialize           & lhs, ArgsInitialize            const& rhs) { lhs = rhs; }
+
+// DeepCopyDestroy(args): releases whatever DeepCopyAssign duplicated for this pack type
+// (attribute structs and strings). Packs without owned pointers have nothing to release; their
+// parameter is left unnamed so builds with unused-parameter warnings stay clean.
+
+// Fallback for packs without owned pointers: no-op.
+template <typename ArgsT> inline void DeepCopyDestroy(ArgsT&) {}
+
+// NVTX_CB_MODULE_CORE packs
+template <> inline void DeepCopyDestroy(ArgsMarkEx       & args) { DestroyEventAttributes(args.eventAttrib); }
+template <> inline void DeepCopyDestroy(ArgsMarkA        & args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsMarkW        & args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsRangeStartEx & args) { DestroyEventAttributes(args.eventAttrib); }
+template <> inline void DeepCopyDestroy(ArgsRangeStartA  & args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsRangeStartW  & args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsRangeEnd     &) { }
+template <> inline void DeepCopyDestroy(ArgsRangePushEx  & args) { DestroyEventAttributes(args.eventAttrib); }
+template <> inline void DeepCopyDestroy(ArgsRangePushA   & args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsRangePushW   & args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsRangePop     &) { }
+template <> inline void DeepCopyDestroy(ArgsNameCategoryA& args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsNameCategoryW& args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsNameOsThreadA& args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsNameOsThreadW& args) { DestroyCstring(args.str); }
+
+// NVTX_CB_MODULE_CORE2 packs
+template <> inline void DeepCopyDestroy(ArgsDomainMarkEx         & args) { DestroyEventAttributes(args.eventAttrib); }
+template <> inline void DeepCopyDestroy(ArgsDomainRangeStartEx   & args) { DestroyEventAttributes(args.eventAttrib); }
+template <> inline void DeepCopyDestroy(ArgsDomainRangeEnd       &) { }
+template <> inline void DeepCopyDestroy(ArgsDomainRangePushEx    & args) { DestroyEventAttributes(args.eventAttrib); }
+template <> inline void DeepCopyDestroy(ArgsDomainRangePop       &) { }
+template <> inline void DeepCopyDestroy(ArgsDomainResourceCreate & args) { DestroyResourceAttributes(args.attr); }
+template <> inline void DeepCopyDestroy(ArgsDomainResourceDestroy&) { }
+template <> inline void DeepCopyDestroy(ArgsDomainNameCategoryA  & args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsDomainNameCategoryW  & args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsDomainRegisterStringA& args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsDomainRegisterStringW& args) { DestroyCstring(args.str); }
+template <> inline void DeepCopyDestroy(ArgsDomainCreateA        & args) { DestroyCstring(args.name); }
+template <> inline void DeepCopyDestroy(ArgsDomainCreateW        & args) { DestroyCstring(args.name); }
+template <> inline void DeepCopyDestroy(ArgsDomainDestroy        &) { }
+template <> inline void DeepCopyDestroy(ArgsInitialize           &) { }
+
+// One recorded call: its id plus the argument pack for that id.
+// The destructor releases the deep-copied strings/attribute structs of whichever union member
+// `id` selects, so a CallData whose id names a pointer-carrying pack must have had that pack
+// filled in through DeepCopyAssign (as the CALL macro does).
+struct CallData
+{
+    CallId id{NVTX_CB_MODULE_INVALID, 0};
+    Args args;
+
+    CallData() = default;
+
+    // Non-copyable: a memberwise copy would share the owned pointers inside `args`, and both
+    // destructors would then free them. Instances are meant to be shared via Call (shared_ptr).
+    CallData(CallData const&) = delete;
+    CallData& operator=(CallData const&) = delete;
+
+    ~CallData()
+    {
+        // Dispatch on (module, callback) to the matching union member; ids that match no case
+        // (including the default-constructed and load pseudo-call ids) own nothing.
+        switch (id.mod)
+        {
+        case NVTX_CB_MODULE_CORE:
+            switch (id.cb)
+            {
+            case NVTX_CBID_CORE_MarkEx       : DeepCopyDestroy(args.MarkEx       ); break;
+            case NVTX_CBID_CORE_MarkA        : DeepCopyDestroy(args.MarkA        ); break;
+            case NVTX_CBID_CORE_MarkW        : DeepCopyDestroy(args.MarkW        ); break;
+            case NVTX_CBID_CORE_RangeStartEx : DeepCopyDestroy(args.RangeStartEx ); break;
+            case NVTX_CBID_CORE_RangeStartA  : DeepCopyDestroy(args.RangeStartA  ); break;
+            case NVTX_CBID_CORE_RangeStartW  : DeepCopyDestroy(args.RangeStartW  ); break;
+            case NVTX_CBID_CORE_RangeEnd     : DeepCopyDestroy(args.RangeEnd     ); break;
+            case NVTX_CBID_CORE_RangePushEx  : DeepCopyDestroy(args.RangePushEx  ); break;
+            case NVTX_CBID_CORE_RangePushA   : DeepCopyDestroy(args.RangePushA   ); break;
+            case NVTX_CBID_CORE_RangePushW   : DeepCopyDestroy(args.RangePushW   ); break;
+            case NVTX_CBID_CORE_RangePop     : DeepCopyDestroy(args.RangePop     ); break;
+            case NVTX_CBID_CORE_NameCategoryA: DeepCopyDestroy(args.NameCategoryA); break;
+            case NVTX_CBID_CORE_NameCategoryW: DeepCopyDestroy(args.NameCategoryW); break;
+            case NVTX_CBID_CORE_NameOsThreadA: DeepCopyDestroy(args.NameOsThreadA); break;
+            case NVTX_CBID_CORE_NameOsThreadW: DeepCopyDestroy(args.NameOsThreadW); break;
+            default: break;
+            }
+            break;
+        case NVTX_CB_MODULE_CORE2:
+            switch (id.cb)
+            {
+            case NVTX_CBID_CORE2_DomainMarkEx         : DeepCopyDestroy(args.DomainMarkEx         ); break;
+            case NVTX_CBID_CORE2_DomainRangeStartEx   : DeepCopyDestroy(args.DomainRangeStartEx   ); break;
+            case NVTX_CBID_CORE2_DomainRangeEnd       : DeepCopyDestroy(args.DomainRangeEnd       ); break;
+            case NVTX_CBID_CORE2_DomainRangePushEx    : DeepCopyDestroy(args.DomainRangePushEx    ); break;
+            case NVTX_CBID_CORE2_DomainRangePop       : DeepCopyDestroy(args.DomainRangePop       ); break;
+            case NVTX_CBID_CORE2_DomainResourceCreate : DeepCopyDestroy(args.DomainResourceCreate ); break;
+            case NVTX_CBID_CORE2_DomainResourceDestroy: DeepCopyDestroy(args.DomainResourceDestroy); break;
+            case NVTX_CBID_CORE2_DomainNameCategoryA  : DeepCopyDestroy(args.DomainNameCategoryA  ); break;
+            case NVTX_CBID_CORE2_DomainNameCategoryW  : DeepCopyDestroy(args.DomainNameCategoryW  ); break;
+            case NVTX_CBID_CORE2_DomainRegisterStringA: DeepCopyDestroy(args.DomainRegisterStringA); break;
+            case NVTX_CBID_CORE2_DomainRegisterStringW: DeepCopyDestroy(args.DomainRegisterStringW); break;
+            case NVTX_CBID_CORE2_DomainCreateA        : DeepCopyDestroy(args.DomainCreateA        ); break;
+            case NVTX_CBID_CORE2_DomainCreateW        : DeepCopyDestroy(args.DomainCreateW        ); break;
+            case NVTX_CBID_CORE2_DomainDestroy        : DeepCopyDestroy(args.DomainDestroy        ); break;
+            case NVTX_CBID_CORE2_Initialize           : DeepCopyDestroy(args.Initialize           ); break;
+            default: break;
+            }
+            break;
+        default: break;
+        }
+    }
+};
+
+// Pretty-prints one recorded call.
+//   - the load pseudo-call prints as:  InitializeInjectionNvtx2 returned <success>
+//   - everything else prints as:       [<module>,<callback id>] <Name>(<arguments>)
+// Wide-string arguments are not rendered; the literal word WIDE is printed in their place.
+// Null narrow strings and null event-attribute pointers print as nullptr.
+inline std::ostream& operator<<(std::ostream& os, CallData const& data)
+{
+    if (data.id == CALLID_LOAD())
+    {
+        return os << CallName(data.id) << " returned " << data.args.Load.success;
+    }
+
+    // Streaming a null `const char*` is undefined behavior, and a null attribute pointer must
+    // not be dereferenced, so both go through these helpers.
+    auto quoted = [&os](const char* s)
+    {
+        if (s) os << '"' << s << '"';
+        else   os << "nullptr";
+    };
+    auto attribs = [&os](const nvtxEventAttributes_t* p)
+    {
+        if (p) os << *p;
+        else   os << "nullptr";
+    };
+
+    os << "[" << data.id.mod << "," << std::setw(2) << data.id.cb << "] ";
+    os << CallName(data.id) << '(';
+    switch (data.id.mod)
+    {
+    case NVTX_CB_MODULE_CORE:
+        switch (data.id.cb)
+        {
+        case NVTX_CBID_CORE_MarkEx       : attribs(data.args.MarkEx.eventAttrib);       break;
+        case NVTX_CBID_CORE_MarkA        : quoted(data.args.MarkA.str);                 break;
+        case NVTX_CBID_CORE_MarkW        : os << "WIDE";                                break;
+        case NVTX_CBID_CORE_RangeStartEx : attribs(data.args.RangeStartEx.eventAttrib); break;
+        case NVTX_CBID_CORE_RangeStartA  : quoted(data.args.RangeStartA.str);           break;
+        case NVTX_CBID_CORE_RangeStartW  : os << "WIDE";                                break;
+        case NVTX_CBID_CORE_RangeEnd     : os << data.args.RangeEnd.id;                 break;
+        case NVTX_CBID_CORE_RangePushEx  : attribs(data.args.RangePushEx.eventAttrib);  break;
+        case NVTX_CBID_CORE_RangePushA   : quoted(data.args.RangePushA.str);            break;
+        case NVTX_CBID_CORE_RangePushW   : os << "WIDE";                                break;
+        case NVTX_CBID_CORE_RangePop     : /* no arguments */                           break;
+        case NVTX_CBID_CORE_NameCategoryA: {auto& a = data.args.NameCategoryA; os << a.id << ", "; quoted(a.str); } break;
+        case NVTX_CBID_CORE_NameCategoryW: {auto& a = data.args.NameCategoryW; os << a.id << ", " << "WIDE";      } break;
+        case NVTX_CBID_CORE_NameOsThreadA: {auto& a = data.args.NameOsThreadA; os << a.id << ", "; quoted(a.str); } break;
+        case NVTX_CBID_CORE_NameOsThreadW: {auto& a = data.args.NameOsThreadW; os << a.id << ", " << "WIDE";      } break;
+        default: break;
+        }
+        break;
+    case NVTX_CB_MODULE_CORE2:
+        switch (data.id.cb)
+        {
+        case NVTX_CBID_CORE2_DomainMarkEx         : {auto& a = data.args.DomainMarkEx         ; os << a.domain << ", "; attribs(a.eventAttrib);               } break;
+        case NVTX_CBID_CORE2_DomainRangeStartEx   : {auto& a = data.args.DomainRangeStartEx   ; os << a.domain << ", "; attribs(a.eventAttrib);               } break;
+        case NVTX_CBID_CORE2_DomainRangeEnd       : {auto& a = data.args.DomainRangeEnd       ; os << a.domain << ", " << a.id;                               } break;
+        case NVTX_CBID_CORE2_DomainRangePushEx    : {auto& a = data.args.DomainRangePushEx    ; os << a.domain << ", "; attribs(a.eventAttrib);               } break;
+        case NVTX_CBID_CORE2_DomainRangePop       : {auto& a = data.args.DomainRangePop       ; os << a.domain;                                               } break;
+        case NVTX_CBID_CORE2_DomainResourceCreate : {auto& a = data.args.DomainResourceCreate ; os << a.domain << ", " << a.attr;                             } break; // TODO (streams a.attr itself, not *a.attr)
+        case NVTX_CBID_CORE2_DomainResourceDestroy: {auto& a = data.args.DomainResourceDestroy; os << a.attr;                                                 } break;
+        case NVTX_CBID_CORE2_DomainNameCategoryA  : {auto& a = data.args.DomainNameCategoryA  ; os << a.domain << ", " << a.id << ", "; quoted(a.str);        } break;
+        case NVTX_CBID_CORE2_DomainNameCategoryW  : {auto& a = data.args.DomainNameCategoryW  ; os << a.domain << ", " << a.id << ", " << "WIDE";             } break;
+        case NVTX_CBID_CORE2_DomainRegisterStringA: {auto& a = data.args.DomainRegisterStringA; os << a.domain << ", "; quoted(a.str);                        } break;
+        case NVTX_CBID_CORE2_DomainRegisterStringW: {auto& a = data.args.DomainRegisterStringW; os << a.domain << ", " << "WIDE";                             } break;
+        case NVTX_CBID_CORE2_DomainCreateA        : quoted(data.args.DomainCreateA.name);                                                                       break;
+        case NVTX_CBID_CORE2_DomainCreateW        : os << "WIDE";                                                                                               break;
+        case NVTX_CBID_CORE2_DomainDestroy        : os << data.args.DomainDestroy.domain;                                                                       break;
+        case NVTX_CBID_CORE2_Initialize           : os << data.args.Initialize.reserved;                                                                        break;
+        default: break;
+        }
+        break;
+    default: break;
+    }
+    os << ')';
+    return os;
+}
+
+// Recorded calls are passed around by shared pointer; the deep-copied arguments inside a
+// CallData are released by ~CallData when the last reference goes away.
+using Call = std::shared_ptr<CallData>;
+
+// Helper to write CALL(CORE, NameCategoryA, id, str) to construct a Call with arg values
+// The immediately-invoked lambda allocates a CallData, sets its id to CALLID(m,c), and
+// deep-copies an Args<c>{...} built from the remaining macro arguments into union member `c`.
+#define CALL(m,c,...) [=]{ Call v(new CallData); v->id = CALLID(m,c); DeepCopyAssign(v->args.c, Args##c{__VA_ARGS__}); return v; }()
+
+// Same for the load pseudo-call: id is CALLID_LOAD() and `s` becomes ArgsLoad::success
+// (plain assignment; ArgsLoad holds no pointers).
+#define CALL_LOAD(s) [=]{ Call v(new CallData); v->id = CALLID_LOAD(); v->args.Load = ArgsLoad{s}; return v; }()
+
+// Overloaded factories for the NVTX C API unions: each one sets the single union member that
+// matches the argument type and returns the union by value. Nothing is copied or owned; string
+// arguments are stored as the pointer that was passed in.
+
+// Message union holding a narrow (ASCII) string pointer.
+inline nvtxMessageValue_t MakeMessage(const char* msg)
+{
+    nvtxMessageValue_t value;
+    value.ascii = msg;
+    return value;
+}
+
+// Message union holding a wide (Unicode) string pointer.
+inline nvtxMessageValue_t MakeMessage(const wchar_t* msg)
+{
+    nvtxMessageValue_t value;
+    value.unicode = msg;
+    return value;
+}
+
+// Message union holding a registered-string handle.
+inline nvtxMessageValue_t MakeMessage(nvtxStringHandle_t msg)
+{
+    nvtxMessageValue_t value;
+    value.registered = msg;
+    return value;
+}
+
+// Payload union with the 64-bit unsigned member set.
+inline nvtxEventAttributes_t::payload_t MakePayload(uint64_t v)
+{
+    nvtxEventAttributes_t::payload_t payload;
+    payload.ullValue = v;
+    return payload;
+}
+
+// Payload union with the 64-bit signed member set.
+inline nvtxEventAttributes_t::payload_t MakePayload(int64_t v)
+{
+    nvtxEventAttributes_t::payload_t payload;
+    payload.llValue = v;
+    return payload;
+}
+
+// Payload union with the double member set.
+inline nvtxEventAttributes_t::payload_t MakePayload(double v)
+{
+    nvtxEventAttributes_t::payload_t payload;
+    payload.dValue = v;
+    return payload;
+}
+
+// Payload union with the 32-bit unsigned member set.
+inline nvtxEventAttributes_t::payload_t MakePayload(uint32_t v)
+{
+    nvtxEventAttributes_t::payload_t payload;
+    payload.uiValue = v;
+    return payload;
+}
+
+// Payload union with the 32-bit signed member set.
+inline nvtxEventAttributes_t::payload_t MakePayload(int32_t v)
+{
+    nvtxEventAttributes_t::payload_t payload;
+    payload.iValue = v;
+    return payload;
+}
+
+// Payload union with the float member set.
+inline nvtxEventAttributes_t::payload_t MakePayload(float v)
+{
+    nvtxEventAttributes_t::payload_t payload;
+    payload.fValue = v;
+    return payload;
+}
+
+// Define Same() overloads for NVTX API types
+
+// Field-by-field comparison of two event-attribute structs.
+// MEMBER_SAME, SAME_COMMON_ARGS, VERBOSE_PRINT and `depth` come from Same.h.
+// NOTE(review): presumably MEMBER_SAME(x) compares lhs.x against rhs.x and may log on
+// mismatch -- confirm in Same.h; the order of the && / || chain below relies on short-circuiting.
+// The payload and message unions are compared only through the member selected by
+// lhs.payloadType / lhs.messageType (checked after the type fields themselves were compared).
+// A payloadType or messageType that is not one of the listed values makes the corresponding
+// group, and therefore the whole comparison, false.
+inline bool Same(nvtxEventAttributes_t const& lhs, nvtxEventAttributes_t const& rhs, SAME_COMMON_ARGS)
+{
+    bool same = true
+        && MEMBER_SAME(version)
+        && MEMBER_SAME(size)
+        && MEMBER_SAME(category)
+        && MEMBER_SAME(colorType)
+        && MEMBER_SAME(color)
+        && MEMBER_SAME(payloadType)
+        && (false
+            || lhs.payloadType == NVTX_PAYLOAD_UNKNOWN
+            || (lhs.payloadType == NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 && MEMBER_SAME(payload.ullValue))
+            || (lhs.payloadType == NVTX_PAYLOAD_TYPE_INT64          && MEMBER_SAME(payload.llValue))
+            || (lhs.payloadType == NVTX_PAYLOAD_TYPE_DOUBLE         && MEMBER_SAME(payload.dValue))
+            || (lhs.payloadType == NVTX_PAYLOAD_TYPE_UNSIGNED_INT32 && MEMBER_SAME(payload.uiValue))
+            || (lhs.payloadType == NVTX_PAYLOAD_TYPE_INT32          && MEMBER_SAME(payload.iValue))
+            || (lhs.payloadType == NVTX_PAYLOAD_TYPE_FLOAT          && MEMBER_SAME(payload.fValue))
+            )
+        && MEMBER_SAME(messageType)
+        && (false
+            || lhs.messageType == NVTX_MESSAGE_UNKNOWN
+            || (lhs.messageType == NVTX_MESSAGE_TYPE_ASCII      && MEMBER_SAME(message.ascii))
+            || (lhs.messageType == NVTX_MESSAGE_TYPE_UNICODE    && MEMBER_SAME(message.unicode))
+            || (lhs.messageType == NVTX_MESSAGE_TYPE_REGISTERED && MEMBER_SAME(message.registered))
+            )
+        ;
+    // Diagnostic output labels rhs as the expected value and lhs as the provided one.
+    VERBOSE_PRINT()
+        << std::string(depth, ' ') << "Expected: " << rhs << "\n"
+        << std::string(depth, ' ') << "Provided: " << lhs << "\n";
+    return same;
+}
+DEFINE_EQ_NE_DEEP(nvtxEventAttributes_t)
+
+// Field-by-field comparison of two resource-attribute structs.
+// MEMBER_SAME, SAME_COMMON_ARGS and VERBOSE_PRINT come from Same.h.
+// NOTE(review): presumably MEMBER_SAME(x) compares lhs.x against rhs.x -- confirm in Same.h.
+// The identifier and message unions are compared only through the member selected by
+// lhs.identifierType / lhs.messageType. An identifierType or messageType that is not one of
+// the listed values makes the corresponding group, and therefore the whole comparison, false.
+inline bool Same(nvtxResourceAttributes_t const& lhs, nvtxResourceAttributes_t const& rhs, SAME_COMMON_ARGS)
+{
+    bool same = true
+        && MEMBER_SAME(version)
+        && MEMBER_SAME(size)
+        && MEMBER_SAME(identifierType)
+        && (false
+            || lhs.identifierType == NVTX_RESOURCE_TYPE_UNKNOWN
+            || (lhs.identifierType == NVTX_RESOURCE_TYPE_GENERIC_POINTER       && MEMBER_SAME(identifier.pValue))
+            || (lhs.identifierType == NVTX_RESOURCE_TYPE_GENERIC_HANDLE        && MEMBER_SAME(identifier.ullValue))
+            || (lhs.identifierType == NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE && MEMBER_SAME(identifier.ullValue))
+            || (lhs.identifierType == NVTX_RESOURCE_TYPE_GENERIC_THREAD_POSIX  && MEMBER_SAME(identifier.ullValue))
+            )
+        && MEMBER_SAME(messageType)
+        && (false
+            || lhs.messageType == NVTX_MESSAGE_UNKNOWN
+            || (lhs.messageType == NVTX_MESSAGE_TYPE_ASCII      && MEMBER_SAME(message.ascii))
+            || (lhs.messageType == NVTX_MESSAGE_TYPE_UNICODE    && MEMBER_SAME(message.unicode))
+            || (lhs.messageType == NVTX_MESSAGE_TYPE_REGISTERED && MEMBER_SAME(message.registered))
+            )
+        ;
+    // Unlike the event-attribute overload, nothing is streamed into the verbose output here.
+    VERBOSE_PRINT();
+    return same;
+}
+DEFINE_EQ_NE_DEEP(nvtxResourceAttributes_t)
+
+// Define Same() overloads (and operators == and !=) for NVTX arg pack types & Args union
+
+// DEFINE_ARGS_SAME_n(cb, members...) forwards to DEFINE_SAME_n (from Same.h) for the struct
+// Args<cb>, listing the n members that take part in the comparison. Every member of a pack
+// should be listed; a member left out is silently ignored when two calls are compared.
+#define DEFINE_ARGS_SAME_0(cb)          DEFINE_SAME_0(Args##cb)
+#define DEFINE_ARGS_SAME_1(cb, a)       DEFINE_SAME_1(Args##cb, a)
+#define DEFINE_ARGS_SAME_2(cb, a, b)    DEFINE_SAME_2(Args##cb, a, b)
+#define DEFINE_ARGS_SAME_3(cb, a, b, c) DEFINE_SAME_3(Args##cb, a, b, c)
+
+DEFINE_ARGS_SAME_1(Load, success)
+// CORE
+DEFINE_ARGS_SAME_1(MarkEx, eventAttrib)
+DEFINE_ARGS_SAME_1(MarkA, str)
+DEFINE_ARGS_SAME_1(MarkW, str)
+DEFINE_ARGS_SAME_1(RangeStartEx, eventAttrib)
+DEFINE_ARGS_SAME_1(RangeStartA, str)
+DEFINE_ARGS_SAME_1(RangeStartW, str)
+// ArgsRangeEnd carries the range id, so it is compared (as DomainRangeEnd does below).
+DEFINE_ARGS_SAME_1(RangeEnd, id)
+DEFINE_ARGS_SAME_1(RangePushEx, eventAttrib)
+DEFINE_ARGS_SAME_1(RangePushA, str)
+DEFINE_ARGS_SAME_1(RangePushW, str)
+DEFINE_ARGS_SAME_0(RangePop)
+DEFINE_ARGS_SAME_2(NameCategoryA, id, str)
+DEFINE_ARGS_SAME_2(NameCategoryW, id, str)
+DEFINE_ARGS_SAME_2(NameOsThreadA, id, str)
+DEFINE_ARGS_SAME_2(NameOsThreadW, id, str)
+// CORE2
+DEFINE_ARGS_SAME_2(DomainMarkEx, domain, eventAttrib)
+DEFINE_ARGS_SAME_2(DomainRangeStartEx, domain, eventAttrib)
+DEFINE_ARGS_SAME_2(DomainRangeEnd, domain, id)
+DEFINE_ARGS_SAME_2(DomainRangePushEx, domain, eventAttrib)
+DEFINE_ARGS_SAME_1(DomainRangePop, domain)
+DEFINE_ARGS_SAME_2(DomainResourceCreate, domain, attr)
+DEFINE_ARGS_SAME_1(DomainResourceDestroy, attr)
+DEFINE_ARGS_SAME_3(DomainNameCategoryA, domain, id, str)
+DEFINE_ARGS_SAME_3(DomainNameCategoryW, domain, id, str)
+DEFINE_ARGS_SAME_2(DomainRegisterStringA, domain, str)
+DEFINE_ARGS_SAME_2(DomainRegisterStringW, domain, str)
+DEFINE_ARGS_SAME_1(DomainCreateA, name)
+DEFINE_ARGS_SAME_1(DomainCreateW, name)
+DEFINE_ARGS_SAME_1(DomainDestroy, domain)
+DEFINE_ARGS_SAME_1(Initialize, reserved)
+
+inline bool Same(CallData const& lhs, CallData const& rhs, SAME_COMMON_ARGS)
+{
+    bool same = true  // the id clause and the args clause below must both hold
+        && MEMBER_SAME(id)
+        && (false  // NOTE(review): UNION_MEMBER_SAME is defined outside this chunk; presumably it compares the named args member only when id equals the given call id
+            || UNION_MEMBER_SAME(id, CALLID_LOAD(), args.Load)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, MarkEx), args.MarkEx)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, MarkA), args.MarkA)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, MarkW), args.MarkW)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, RangeStartEx), args.RangeStartEx)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, RangeStartA), args.RangeStartA)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, RangeStartW), args.RangeStartW)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, RangeEnd), args.RangeEnd)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, RangePushEx), args.RangePushEx)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, RangePushA), args.RangePushA)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, RangePushW), args.RangePushW)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, RangePop), args.RangePop)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, NameCategoryA), args.NameCategoryA)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, NameCategoryW), args.NameCategoryW)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, NameOsThreadA), args.NameOsThreadA)
+            || UNION_MEMBER_SAME(id, CALLID(CORE, NameOsThreadW), args.NameOsThreadW)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainMarkEx), args.DomainMarkEx)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainRangeStartEx), args.DomainRangeStartEx)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainRangeEnd), args.DomainRangeEnd)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainRangePushEx), args.DomainRangePushEx)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainRangePop), args.DomainRangePop)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainResourceCreate), args.DomainResourceCreate)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainResourceDestroy), args.DomainResourceDestroy)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainNameCategoryA), args.DomainNameCategoryA)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainNameCategoryW), args.DomainNameCategoryW)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainRegisterStringA), args.DomainRegisterStringA)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainRegisterStringW), args.DomainRegisterStringW)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainCreateA), args.DomainCreateA)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainCreateW), args.DomainCreateW)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, DomainDestroy), args.DomainDestroy)
+            || UNION_MEMBER_SAME(id, CALLID(CORE2, Initialize), args.Initialize)
+            )  // one entry per Args type declared with DEFINE_ARGS_SAME_* above; keep the two lists in sync
+        ;
+
+    VERBOSE_PRINT();  // NOTE(review): presumably reports a mismatch when verbose; macro not visible here
+    return same;
+}
+DEFINE_EQ_NE_DEEP(CallData)  // presumably builds == and != on top of Same(); macro not visible here
+
+// Post-increment for handles/ids: returns the old value and stores old+1 (pointers via a by-value intptr_t round-trip, no aliasing).
+template <typename T> inline T* PostInc(T*& h)
+{ T* v = h; h = reinterpret_cast<T*>(reinterpret_cast<intptr_t>(h) + 1); return v; }
+inline nvtxRangeId_t PostInc(nvtxRangeId_t& h) { return h++; }
+
+struct Callbacks  // Table of replaceable std::function hooks, one per intercepted NVTX entry point
+{
+    std::function<void(Call const&)> Default;  // sink that every default handler reports its call to
+    std::function<void(int)> Load;
+
+    std::function<void(const nvtxEventAttributes_t*)> MarkEx;
+    std::function<void(const char*)> MarkA;
+    std::function<void(const wchar_t*)> MarkW;
+    std::function<nvtxRangeId_t(const nvtxEventAttributes_t*)> RangeStartEx;
+    std::function<nvtxRangeId_t(const char*)> RangeStartA;
+    std::function<nvtxRangeId_t(const wchar_t*)> RangeStartW;
+    std::function<void(nvtxRangeId_t)> RangeEnd;
+    std::function<int(const nvtxEventAttributes_t*)> RangePushEx;
+    std::function<int(const char*)> RangePushA;
+    std::function<int(const wchar_t*)> RangePushW;
+    std::function<int()> RangePop;
+    std::function<void(uint32_t, const char*)> NameCategoryA;
+    std::function<void(uint32_t, const wchar_t*)> NameCategoryW;
+    std::function<void(uint32_t, const char*)> NameOsThreadA;
+    std::function<void(uint32_t, const wchar_t*)> NameOsThreadW;
+
+    std::function<void(nvtxDomainHandle_t, const nvtxEventAttributes_t*)> DomainMarkEx;
+    std::function<nvtxRangeId_t(nvtxDomainHandle_t, const nvtxEventAttributes_t*)> DomainRangeStartEx;
+    std::function<void(nvtxDomainHandle_t, nvtxRangeId_t)> DomainRangeEnd;
+    std::function<int(nvtxDomainHandle_t, const nvtxEventAttributes_t*)> DomainRangePushEx;
+    std::function<int(nvtxDomainHandle_t)> DomainRangePop;
+    std::function<nvtxResourceHandle_t(nvtxDomainHandle_t, nvtxResourceAttributes_t*)> DomainResourceCreate;
+    std::function<void(nvtxResourceHandle_t)> DomainResourceDestroy;
+    std::function<void(nvtxDomainHandle_t, uint32_t, const char*)> DomainNameCategoryA;
+    std::function<void(nvtxDomainHandle_t, uint32_t, const wchar_t*)> DomainNameCategoryW;
+    std::function<nvtxStringHandle_t(nvtxDomainHandle_t, const char*)> DomainRegisterStringA;
+    std::function<nvtxStringHandle_t(nvtxDomainHandle_t, const wchar_t*)> DomainRegisterStringW;
+    std::function<nvtxDomainHandle_t(const char*)> DomainCreateA;
+    std::function<nvtxDomainHandle_t(const wchar_t*)> DomainCreateW;
+    std::function<void(nvtxDomainHandle_t)> DomainDestroy;
+    std::function<void(const void*)> Initialize;
+
+    // NOTE(review): the default handlers capture `this` ([&]); a copied/moved Callbacks still calls into its source object -- confirm intended.
+    Callbacks(Callbacks const&) = default;
+    Callbacks& operator=(Callbacks const&) = default;
+    Callbacks(Callbacks&&) = default;
+    Callbacks& operator=(Callbacks&&) = default;
+
+    nvtxDomainHandle_t nextDomainHandle = (nvtxDomainHandle_t)1;  // value the default DomainCreateA/W return next
+    struct DomainData  // counters the default handlers use to synthesize return values for one domain
+    {
+        int pushPopDepth = 0;
+        nvtxRangeId_t nextRangeId = (nvtxRangeId_t)1;
+        nvtxStringHandle_t nextStringHandle = (nvtxStringHandle_t)1;
+        nvtxResourceHandle_t nextResourceHandle = (nvtxResourceHandle_t)1;
+    };
+    std::map<nvtxDomainHandle_t, DomainData> domainData;  // the non-domain (CORE) handlers use the nullptr key
+
+    Callbacks()  // Default handlers report each call through Default; value-returning ones also advance a counter
+    : Default([](Call const&) {})
+    , Load   ([&](int success) { Default(CALL_LOAD(success)); })
+    // CORE
+    , MarkEx       ([&](const nvtxEventAttributes_t* a) { Default(CALL(CORE, MarkEx       , a   )); })
+    , MarkA        ([&](const char*                  a) { Default(CALL(CORE, MarkA        , a   )); })
+    , MarkW        ([&](const wchar_t*               a) { Default(CALL(CORE, MarkW        , a   )); })
+    , RangeStartEx ([&](const nvtxEventAttributes_t* a) { Default(CALL(CORE, RangeStartEx , a   )); return PostInc(domainData[nullptr].nextRangeId); })
+    , RangeStartA  ([&](const char*                  a) { Default(CALL(CORE, RangeStartA  , a   )); return PostInc(domainData[nullptr].nextRangeId); })
+    , RangeStartW  ([&](const wchar_t*               a) { Default(CALL(CORE, RangeStartW  , a   )); return PostInc(domainData[nullptr].nextRangeId); })
+    , RangeEnd     ([&](nvtxRangeId_t                a) { Default(CALL(CORE, RangeEnd     , a   )); })
+    , RangePushEx  ([&](const nvtxEventAttributes_t* a) { Default(CALL(CORE, RangePushEx  , a   )); return ++domainData[nullptr].pushPopDepth; })
+    , RangePushA   ([&](const char*                  a) { Default(CALL(CORE, RangePushA   , a   )); return ++domainData[nullptr].pushPopDepth; })
+    , RangePushW   ([&](const wchar_t*               a) { Default(CALL(CORE, RangePushW   , a   )); return ++domainData[nullptr].pushPopDepth; })
+    , RangePop     ([&](                              ) { Default(CALL(CORE, RangePop           )); return domainData[nullptr].pushPopDepth--; })
+    , NameCategoryA([&](uint32_t a, const char*      b) { Default(CALL(CORE, NameCategoryA, a, b)); })
+    , NameCategoryW([&](uint32_t a, const wchar_t*   b) { Default(CALL(CORE, NameCategoryW, a, b)); })
+    , NameOsThreadA([&](uint32_t a, const char*      b) { Default(CALL(CORE, NameOsThreadA, a, b)); })
+    , NameOsThreadW([&](uint32_t a, const wchar_t*   b) { Default(CALL(CORE, NameOsThreadW, a, b)); })
+    // CORE2
+    , DomainMarkEx         ([&](nvtxDomainHandle_t a, const nvtxEventAttributes_t* b) { Default(CALL(CORE2, DomainMarkEx         , a, b   )); })
+    , DomainRangeStartEx   ([&](nvtxDomainHandle_t a, const nvtxEventAttributes_t* b) { Default(CALL(CORE2, DomainRangeStartEx   , a, b   )); return PostInc(domainData[a].nextRangeId); })
+    , DomainRangeEnd       ([&](nvtxDomainHandle_t a, nvtxRangeId_t                b) { Default(CALL(CORE2, DomainRangeEnd       , a, b   )); })
+    , DomainRangePushEx    ([&](nvtxDomainHandle_t a, const nvtxEventAttributes_t* b) { Default(CALL(CORE2, DomainRangePushEx    , a, b   )); return ++domainData[a].pushPopDepth; })
+    , DomainRangePop       ([&](nvtxDomainHandle_t                                 a) { Default(CALL(CORE2, DomainRangePop       , a      )); return domainData[a].pushPopDepth--; })
+    , DomainResourceCreate ([&](nvtxDomainHandle_t a, nvtxResourceAttributes_t*    b) { Default(CALL(CORE2, DomainResourceCreate , a, b   )); return PostInc(domainData[a].nextResourceHandle); })
+    , DomainResourceDestroy([&](nvtxResourceHandle_t                               a) { Default(CALL(CORE2, DomainResourceDestroy, a      )); })
+    , DomainNameCategoryA  ([&](nvtxDomainHandle_t a, uint32_t b, const char*      c) { Default(CALL(CORE2, DomainNameCategoryA  , a, b, c)); })
+    , DomainNameCategoryW  ([&](nvtxDomainHandle_t a, uint32_t b, const wchar_t*   c) { Default(CALL(CORE2, DomainNameCategoryW  , a, b, c)); })
+    , DomainRegisterStringA([&](nvtxDomainHandle_t a, const char*                  b) { Default(CALL(CORE2, DomainRegisterStringA, a, b   )); return PostInc(domainData[a].nextStringHandle); })
+    , DomainRegisterStringW([&](nvtxDomainHandle_t a, const wchar_t*               b) { Default(CALL(CORE2, DomainRegisterStringW, a, b   )); return PostInc(domainData[a].nextStringHandle); })
+    , DomainCreateA        ([&](const char*                                        a) { Default(CALL(CORE2, DomainCreateA        , a      )); return PostInc(nextDomainHandle); })
+    , DomainCreateW        ([&](const wchar_t*                                     a) { Default(CALL(CORE2, DomainCreateW        , a      )); return PostInc(nextDomainHandle); })
+    , DomainDestroy        ([&](nvtxDomainHandle_t                                 a) { Default(CALL(CORE2, DomainDestroy        , a      )); })
+    , Initialize           ([&](const void*                                        a) { Default(CALL(CORE2, Initialize           , a      )); })
+    {
+    }
+};
+
+extern Callbacks g_callbacks;  // declaration only; the object itself is defined elsewhere
+
+
diff --git a/tests/TestCoverage.h b/tests/TestCoverage.h
new file mode 100644
index 0000000..bde609e
--- /dev/null
+++ b/tests/TestCoverage.h
@@ -0,0 +1,498 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include <nvtx3/nvtx3.hpp>
+
+#include <iostream>
+#include <string>
+
+#include "PrettyPrintersNvtxCpp.h"
+
+struct a_lib  // Tag type passed as the domain argument to the *_in<> APIs and NVTX3_*_IN macros below
+{
+    static constexpr char const* name = "Library A";
+    // static constexpr const float name{3.14f};  // non-string alternative, left disabled
+};
+
+struct cat_x  // Tag type handed to named_category::get<>() for the global domain
+{
+    static constexpr char const* name = "Category X";
+    static constexpr uint32_t id = 42;
+};
+
+struct cat_y  // Tag type handed to named_category_in<a_lib>::get<>()
+{
+    static constexpr char const* name = "Category Y";
+    // static constexpr const float name{3.14f};  // non-string alternative, left disabled
+    static constexpr uint32_t id = 43;
+};
+
+struct regstr_hello  // Tag type handed to registered_string::get<>() / registered_string_in<D>::get<>()
+{
+    static constexpr char const* message = "Hello";
+};
+
+static void TestFuncRange()
+{
+    NVTX3_FUNC_RANGE();  // short-name macro; no domain argument, no condition
+    nvtx3::mark("Marker in TestFuncRange");  // marker whose text names this function
+}
+
+static void TestFuncRangeV()
+{
+    NVTX3_V1_FUNC_RANGE();  // explicitly versioned (V1) spelling; no domain argument, no condition
+    nvtx3::mark("Marker in TestFuncRangeV");  // marker whose text names this function
+}
+
+static void TestFuncRangeIfDyn(bool cond)  // "Dyn": RunTestCommon passes an argc-dependent condition
+{
+    NVTX3_FUNC_RANGE_IF(cond);  // conditional form, short-name macro
+    nvtx3::mark("Marker in TestFuncRangeIfDyn");  // issued for either value of cond
+}
+
+static void TestFuncRangeIfDynV(bool cond)  // "Dyn": RunTestCommon passes an argc-dependent condition
+{
+    NVTX3_V1_FUNC_RANGE_IF(cond);  // conditional form, explicitly versioned (V1) spelling
+    nvtx3::mark("Marker in TestFuncRangeIfDynV");  // issued for either value of cond
+}
+
+static void TestFuncRangeIfStat(bool cond)  // "Stat": RunTestCommon passes literal true/false
+{
+    NVTX3_FUNC_RANGE_IF(cond);  // conditional form, short-name macro
+    nvtx3::mark("Marker in TestFuncRangeIfStat");  // issued for either value of cond
+}
+
+static void TestFuncRangeIfStatV(bool cond)  // "Stat": RunTestCommon passes literal true/false
+{
+    NVTX3_V1_FUNC_RANGE_IF(cond);  // conditional form, explicitly versioned (V1) spelling
+    nvtx3::mark("Marker in TestFuncRangeIfStatV");  // issued for either value of cond
+}
+
+static void TestFuncRangeIn()
+{
+    NVTX3_FUNC_RANGE_IN(a_lib);  // domain-taking form, short-name macro, given the a_lib tag
+    nvtx3::mark("Marker in TestFuncRangeIn");  // note: plain mark(), not mark_in<a_lib>()
+}
+
+static void TestFuncRangeInV()
+{
+    NVTX3_V1_FUNC_RANGE_IN(a_lib);  // domain-taking form, explicitly versioned (V1) spelling
+    nvtx3::mark("Marker in TestFuncRangeInV");  // note: plain mark(), not mark_in<a_lib>()
+}
+
+static void TestFuncRangeIfInDyn(bool cond)  // "Dyn": RunTestCommon passes an argc-dependent condition
+{
+    NVTX3_FUNC_RANGE_IF_IN(a_lib, cond);  // domain-taking conditional form, short-name macro
+    nvtx3::mark("Marker in TestFuncRangeIfInDyn");  // issued for either value of cond
+}
+
+static void TestFuncRangeIfInDynV(bool cond)  // "Dyn": RunTestCommon passes an argc-dependent condition
+{
+    NVTX3_V1_FUNC_RANGE_IF_IN(a_lib, cond);  // domain-taking conditional form, explicitly versioned (V1) spelling
+    nvtx3::mark("Marker in TestFuncRangeIfInDynV");  // issued for either value of cond
+}
+
+static void TestFuncRangeIfInStat(bool cond)  // "Stat": RunTestCommon passes literal true/false
+{
+    NVTX3_FUNC_RANGE_IF_IN(a_lib, cond);  // domain-taking conditional form, short-name macro
+    nvtx3::mark("Marker in TestFuncRangeIfInStat");  // issued for either value of cond
+}
+
+static void TestFuncRangeIfInStatV(bool cond)  // "Stat": RunTestCommon passes literal true/false
+{
+    NVTX3_V1_FUNC_RANGE_IF_IN(a_lib, cond);  // domain-taking conditional form, explicitly versioned (V1) spelling
+    nvtx3::mark("Marker in TestFuncRangeIfInStatV");  // issued for either value of cond
+}
+
+static int RunTestCommon(int argc, const char** argv)
+{
+    // Exercises the nvtx3 C++ API; "-v" anywhere in argv also prints the attribute dumps and separators.
+    // argv is walked up to its terminating null entry; argc only feeds the "Macros" conditions.  Returns 0.
+    bool verbose = false;
+    const std::string verboseArg = "-v";
+    for (; argv && *argv; ++argv)
+        if (*argv == verboseArg) verbose = true;
+
+    using namespace nvtx3;
+
+    {
+        std::cout << "Default attributes:\n";
+        event_attributes attr;
+        if (verbose) std::cout << attr << '\n';
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a message (ascii), payload, color, and category:\n";
+        event_attributes attr{
+            message{"Hello"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}};
+        if (verbose) std::cout << attr << '\n';
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a message with different string types:\n";
+
+        event_attributes a{message{"Hello"}};
+        if (verbose) std::cout << a << '\n';
+
+        event_attributes wa{message{L"Hello"}};
+        if (verbose) std::cout << wa << '\n';
+
+        std::string hello{"Hello"};
+        event_attributes b{message{hello}};
+        if (verbose) std::cout << b << '\n';
+
+        std::wstring whello{L"Hello"};
+        event_attributes wb{message{whello}};
+        if (verbose) std::cout << wb << '\n';
+
+        // Important!  Neither of the following will compile:
+        //
+        //   event_attributes c{message{std::string{"foo"}}};
+        //   std::cout << c;
+        //
+        //   std::string foo{"foo"};
+        //   event_attributes d{message{foo + "bar"}};
+        //   std::cout << d;
+        //
+        // Both of those usages fail with:
+        // "error C2280: 'message::message(std::string &&)':
+        //  attempting to reference a deleted function"
+        //
+        // message is a "view" class, not an owning class.
+        // It cannot take ownership of a temporary string and
+        // destroy it when it goes out of scope.  Similarly,
+        // event_attributes is not an owning class, so it cannot take
+        // ownership of a message either.
+        //
+        // TODO:  Could we add implicit support for this?
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a message (registered):\n";
+        auto hTacobell = reinterpret_cast<nvtxStringHandle_t>(0x7ac0be11);
+        event_attributes attr{message{hTacobell}};
+        if (verbose) std::cout << attr << '\n';
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Convenience: Set a message without the helper type:\n";
+
+        event_attributes a{"Hello"};
+        if (verbose) std::cout << a << '\n';
+
+        std::string hello{"Hello"};
+        event_attributes b{hello};
+        if (verbose) std::cout << b << '\n';
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a payload twice (first should win):\n";
+        event_attributes attr{"test", payload{1.0f}, payload{2}};
+        if (verbose) std::cout << attr << '\n';
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a color twice (first should win):\n";
+        event_attributes attr{"test", argb{127,0,0,255}, rgb{0,255,0}};
+        if (verbose) std::cout << attr << '\n';
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a message twice (first should win):\n";
+        event_attributes attr{L"wide", "narrow"};
+        if (verbose) std::cout << attr << '\n';
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Set a category twice (first should win):\n";
+        event_attributes attr{"test", category{1}, category{2}};
+        if (verbose) std::cout << attr << '\n';
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Markers\n";
+
+        // Global domain
+        event_attributes attr{
+            message{"Hello1"},
+            category{11},
+            payload{5.0f},
+            rgb{1,2,3}};
+        mark(attr);
+
+        mark(event_attributes{
+            message{"Hello2"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}});
+
+        mark(
+            message{"Hello3"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0});
+
+        // a_lib domain
+        event_attributes a_attr{
+            message{"a: Hello1"},
+            category{11},
+            payload{5.0f},
+            rgb{1,2,3}};
+        mark_in<a_lib>(a_attr);
+
+        mark_in<a_lib>(event_attributes{
+            message{"a: Hello2"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}});
+
+        mark_in<a_lib>(
+            message{"a: Hello3"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0});
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Range start/end and range_handle\n";
+
+        // Global domain
+        event_attributes attr{
+            message{"Hello1"},
+            category{11},
+            payload{5.0f},
+            rgb{1,2,3}};
+        auto h1 = start_range(attr);
+
+        auto h2 = start_range(event_attributes{
+            message{"Hello2"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}});
+
+        auto h3 = start_range(
+            message{"Hello3"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0});
+
+        // a_lib domain
+        event_attributes a_attr{
+            message{"a: Hello1"},
+            category{11},
+            payload{5.0f},
+            rgb{1,2,3}};
+        auto h4 = start_range_in<a_lib>(a_attr);
+
+        auto h5 = start_range_in<a_lib>(event_attributes{
+            message{"a: Hello2"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}});
+
+        auto h6 = start_range_in<a_lib>(
+            message{"a: Hello3"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0});
+
+        // range_handle operator ==, !=, and cast overloads
+        bool testEq = h1 == h2;
+        bool testNe = h3 != h4;
+        bool testCast = bool(h5);
+        if (verbose) std::cout << std::boolalpha
+            << testEq << "\n"
+            << testNe << "\n"
+            << testCast << "\n";
+
+        end_range(h1);
+        end_range(h2);
+        end_range(h3);
+
+        end_range_in<a_lib>(h4);
+        end_range_in<a_lib>(h5);
+        end_range_in<a_lib>(h6);
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "unique_range\n";
+
+        // Global domain
+        event_attributes attr{
+            message{"Hello1"},
+            category{11},
+            payload{5.0f},
+            rgb{1,2,3}};
+        unique_range u1{attr};
+
+        unique_range u2{event_attributes{
+            message{"Hello2"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}}};
+
+        unique_range u3{
+            message{"Hello3"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}};
+
+        // a_lib domain
+        event_attributes a_attr{
+            message{"a: Hello1"},
+            category{11},
+            payload{5.0f},
+            rgb{1,2,3}};
+        unique_range_in<a_lib> u4{a_attr};
+
+        unique_range_in<a_lib> u5{event_attributes{
+            message{"a: Hello2"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}}};
+
+        unique_range_in<a_lib> u6{
+            message{"a: Hello3"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}};
+
+        // movability
+        auto move_in_out_global = [](unique_range           u) { return u; };
+        auto move_in_out_domain = [](unique_range_in<a_lib> u) { return u; };
+
+        auto u1moved = move_in_out_global(std::move(u1));
+        auto u4moved = move_in_out_domain(std::move(u4));
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "scoped_range\n";
+
+        // Global domain
+        event_attributes attr{
+            message{"Hello1"},
+            category{11},
+            payload{5.0f},
+            rgb{1,2,3}};
+        scoped_range s1{attr};
+
+        scoped_range s2{event_attributes{
+            message{"Hello2"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}}};
+
+        scoped_range s3{
+            message{"Hello3"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}};
+
+        // a_lib domain
+        event_attributes a_attr{
+            message{"a: Hello1"},
+            category{11},
+            payload{5.0f},
+            rgb{1,2,3}};
+        scoped_range_in<a_lib> s4{a_attr};
+
+        scoped_range_in<a_lib> s5{event_attributes{
+            message{"a: Hello2"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}}};
+
+        scoped_range_in<a_lib> s6{
+            message{"a: Hello3"},
+            category{11},
+            payload{5.0f},
+            rgb{0,255,0}};
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "named_category\n";
+
+        // Global domain
+        mark                   ("Cat", named_category::get                   <cat_x>());
+        mark_in<>              ("Cat", named_category_in<>::get              <cat_x>());
+        mark_in<domain::global>("Cat", named_category_in<domain::global>::get<cat_x>());
+
+        // a_lib domain
+        mark_in<a_lib>("Cat", named_category_in<a_lib>::get<cat_y>());
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "registered_string\n";
+
+        // Global domain
+        mark                   ("RegStr", registered_string::get                   <regstr_hello>());
+        mark_in<>              ("RegStr", registered_string_in<>::get              <regstr_hello>());
+        mark_in<domain::global>("RegStr", registered_string_in<domain::global>::get<regstr_hello>());
+
+        // a_lib domain
+        mark_in<a_lib>("RegStr", registered_string_in<a_lib>::get<regstr_hello>());
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    {
+        std::cout << "Macros:\n";
+        TestFuncRange();
+        TestFuncRangeV();
+        TestFuncRangeIfDyn(argc == 1001);
+        TestFuncRangeIfDyn(argc != 1001);
+        TestFuncRangeIfDynV(argc == 1002);
+        TestFuncRangeIfDynV(argc != 1002);
+        TestFuncRangeIfStat(true);
+        TestFuncRangeIfStat(false);
+        TestFuncRangeIfStatV(true);
+        TestFuncRangeIfStatV(false);
+
+        TestFuncRangeIn();
+        TestFuncRangeInV();
+        TestFuncRangeIfInDyn(argc == 1003);
+        TestFuncRangeIfInDyn(argc != 1003);
+        TestFuncRangeIfInDynV(argc == 1004);
+        TestFuncRangeIfInDynV(argc != 1004);
+        TestFuncRangeIfInStat(true);
+        TestFuncRangeIfInStat(false);
+        TestFuncRangeIfInStatV(true);
+        TestFuncRangeIfInStatV(false);
+    }
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    return 0;
+}
diff --git a/tests/TestSelfInjection.cpp b/tests/TestSelfInjection.cpp
new file mode 100644
index 0000000..4e42166
--- /dev/null
+++ b/tests/TestSelfInjection.cpp
@@ -0,0 +1,271 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include <nvtx3/nvToolsExt.h>
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "SelfInjection.h"
+
+// Simple aggregate that provides its own operator== and operator<<.
+struct S1 { int i; float f; };
+// Member-wise equality (exact float comparison).
+bool operator==(S1 const& lhs, S1 const& rhs)
+{
+    return !(lhs.i != rhs.i || lhs.f != rhs.f);
+}
+// Writes an S1 to the stream as "{i,f}" and returns the stream.
+std::ostream& operator<<(std::ostream& lhs, S1 const& rhs)
+{
+    lhs << '{' << rhs.i << ',' << rhs.f << '}';
+    return lhs;
+}
+
+struct S2  // Aggregate with no operator== of its own; this file compares it through a Same() overload
+{
+    int         i;
+    float       f;
+    char const* s;
+};
+
+static bool Same(S2 const& lhs, S2 const& rhs, SAME_COMMON_ARGS)
+{
+    // Compare member by member in the order i, f, s; later members are skipped after the first mismatch.
+    bool same = Same(lhs.i, rhs.i, deep, verbose, "i", oss, depth + 1);
+    same = same && Same(lhs.f, rhs.f, deep, verbose, "f", oss, depth + 1);
+    same = same && Same(lhs.s, rhs.s, deep, verbose, "s", oss, depth + 1);
+    if (!same && verbose) oss << std::string(depth, ' ') << "'" << name << "' members different\n";
+    return same;
+}
+
+static bool TestSame(bool verbose, bool deep)
+{
+    std::cout << std::boolalpha;
+
+    std::cout << "--- Simple ints:\n";
+    {
+        int xL = 5, xR = 5;
+        bool result = Same(xL, xR, deep, verbose, "x");
+        std::cout << "> == ints: " << result << '\n';
+    }
+    {
+        int xL = 5, xR = 6;
+        bool result = Same(xL, xR, deep, verbose, "x");
+        std::cout << "> != ints: " << result << '\n';
+    }
+
+    std::cout << "--- C-style strings:\n";
+    {
+        const char* str = "String";
+        bool result = Same(str, str, deep, verbose, "str");
+        std::cout << "> char string w/itself: " << result << '\n';
+    }
+    {
+        const char* strL = "String";
+        const char* strR = "String";
+        bool result = Same(strL, strR, deep, verbose, "str");
+        std::cout << "> == char strings: " << result << '\n';
+    }
+    {
+        const char* strL = "StringA";
+        const char* strR = "StringB";
+        bool result = Same(strL, strR, deep, verbose, "str");
+        std::cout << "> != char strings: " << result << '\n';
+    }
+
+    std::cout << "--- Structs with == and << operators:\n";
+    {
+        S1 sL{5, 3.125f};
+        S1 sR{5, 3.125f};
+        bool result = Same(sL, sR, deep, verbose, "S1");
+        std::cout << "> == S1s: " << result << '\n';
+    }
+    {
+        S1 sL{5, 3.125f};
+        S1 sR{5, 3.14159f};
+        bool result = Same(sL, sR, deep, verbose, "S1");
+        std::cout << "> != S1s: " << result << '\n';
+    }
+
+    std::cout << "--- Pointers to structs with == and << operators:\n";
+    {
+        S1 sL{5, 3.125f};
+        S1* psL = &sL;
+        bool result = Same(psL, psL, deep, verbose, "S1 ptr");
+        std::cout << "> same ptr to an S1: " << result << '\n';
+    }
+    {
+        S1 sL{5, 3.125f};
+        S1 sR{5, 3.125f};
+        S1* psL = &sL;
+        S1* psR = &sR;
+        bool result = Same(psL, psR, deep, verbose, "S1 ptr");
+        std::cout << "> different ptrs to == S1s: " << result << '\n';
+    }
+    {
+        S1 sL{5, 3.125f};
+        S1 sR{5, 3.14159f};
+        S1* psL = &sL;
+        S1* psR = &sR;
+        bool result = Same(psL, psR, deep, verbose, "S1 ptr");
+        std::cout << "> different ptrs to != S1s: " << result << '\n';
+    }
+
+    std::cout << "--- Structs with Same function defined:\n";
+    {
+        S2 sL{5, 3.125f, "An S2"};
+        S2 sR{5, 3.125f, "An S2"};
+        bool result = Same(sL, sR, deep, verbose, "S2");
+        std::cout << "> == S2s: " << result << '\n';
+    }
+    {
+        S2 sL{5, 3.125f, "An S2"};
+        S2 sR{5, 3.14159f, "An S2"};
+        bool result = Same(sL, sR, deep, verbose, "S2");
+        std::cout << "> !=f in S2s: " << result << '\n';
+    }
+    {
+        S2 sL{5, 3.125f, "An S2"};
+        S2 sR{5, 3.125f, "Another S2"};
+        bool result = Same(sL, sR, deep, verbose, "S2");
+        std::cout << "> !=s in S2s: " << result << '\n';
+    }
+
+    std::cout << "--- NVTX handles - pointers to incomplete types:\n";
+    {
+        auto hL = reinterpret_cast<nvtxDomainHandle_t>(1024);
+        auto hR = reinterpret_cast<nvtxDomainHandle_t>(1024);
+        bool result = Same(hL, hR, deep, verbose, "nvtxDomainHandle_t");
+        std::cout << "> == domain handles: " << result << '\n';
+    }
+    {
+        auto hL = reinterpret_cast<nvtxDomainHandle_t>(1024);
+        auto hR = reinterpret_cast<nvtxDomainHandle_t>(2048);
+        bool result = Same(hL, hR, deep, verbose, "nvtxDomainHandle_t");
+        std::cout << "> != domain handles: " << result << '\n';
+    }
+
+    std::cout << "--- NVTX event attributes - struct with tagged union:\n";
+    {
+        char buf1[]{"Test message"};
+        char buf2[]{"Test message"};
+
+        nvtxEventAttributes_t aL{};
+        aL.version = NVTX_VERSION;
+        aL.size = sizeof(nvtxEventAttributes_t);
+        aL.category = 5;
+        aL.colorType = NVTX_COLOR_ARGB;
+        aL.color = 0xFF446688;
+        aL.payloadType = NVTX_PAYLOAD_TYPE_DOUBLE;
+        aL.payload.dValue = 3.125;
+        aL.messageType = NVTX_MESSAGE_TYPE_ASCII;
+        aL.message.ascii = buf1;
+        aL.reserved0 = 1;
+
+        auto aR = aL;
+
+        auto* paL = &aL;
+        auto* paR = &aR;
+
+        bool result = Same(aL, aR, deep, verbose, "nvtxEventAttributes_t");
+        std::cout << "> == attrs: " << result << '\n';
+
+        aR = aL;
+        aR.reserved0 = 2;
+        result = Same(aL, aR, deep, verbose, "nvtxEventAttributes_t");
+        std::cout << "> == attrs with different padding: " << result << '\n';
+
+        aR = aL;
+        aR.category = 6;
+        result = Same(aL, aR, deep, verbose, "nvtxEventAttributes_t");
+        std::cout << "> != attrs, category: " << result << '\n';
+
+        aR = aL;
+        aR.message.ascii = buf2;
+        result = Same(aL, aR, deep, verbose, "nvtxEventAttributes_t");
+        std::cout << "> == attrs with same message in different buffers: " << result << '\n';
+
+        aR = aL;
+        aR.message.ascii = "Different message";
+        result = Same(aL, aR, deep, verbose, "nvtxEventAttributes_t");
+        std::cout << "> != attrs, message: " << result << '\n';
+
+        aR = aL;
+        aR.payloadType = NVTX_PAYLOAD_TYPE_FLOAT;
+        result = Same(aL, aR, deep, verbose, "nvtxEventAttributes_t");
+        std::cout << "> != attrs, payloadType: " << result << '\n';
+
+        aR = aL;
+        aR.payload.dValue = -3.125;
+        result = Same(aL, aR, deep, verbose, "nvtxEventAttributes_t");
+        std::cout << "> != attrs, payload union value: " << result << '\n';
+
+        aR = aL;
+        result = Same(paL, paL, deep, verbose, "nvtxEventAttributes_t by pointer");
+        std::cout << "> == attr pointers: " << result << '\n';
+
+        result = Same(paL, paR, deep, verbose, "nvtxEventAttributes_t by pointer");
+        std::cout << "> == attr values, different pointers: " << result << '\n';
+
+        aR.payload.dValue = -3.125;
+        result = Same(paL, paR, deep, verbose, "nvtxEventAttributes_t by pointer");
+        std::cout << "> != attr values, payload union value: " << result << '\n';
+    }
+
+    return true;
+}
+
+extern "C" NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+
+    // The arguments are ignored, so output is always printed.  TestSame itself is run once
+    // for each combination of the two flags named in the banners, covering both its verbose
+    // and non-verbose modes.  Returns 1 as soon as a run reports failure, 0 otherwise.
+
+    struct Pass { const char* banner; bool verbose; bool deep; };
+
+    const Pass passes[] = {
+        {"\n------- Non-verbose, non-deep:\n", false, false},
+        {"\n------- Non-verbose, deep:\n",     false, true},
+        {"\n------- Verbose, non-deep:\n",     true,  false},
+        {"\n------- Verbose, deep:\n",         true,  true},
+    };
+
+    for (const Pass& pass : passes)
+    {
+        std::cout << pass.banner;
+        const bool success = TestSame(pass.verbose, pass.deep);
+        if (!success)
+        {
+            std::cout << "TestSame returned false!\n";
+            return 1;
+        }
+    }
+
+    std::cout << "\n--------- Success!\n";
+    return 0;
+}
diff --git a/tests/UseExportedApi.cpp b/tests/UseExportedApi.cpp
new file mode 100644
index 0000000..474c68f
--- /dev/null
+++ b/tests/UseExportedApi.cpp
@@ -0,0 +1,210 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed under the Apache License v2.0 with LLVM Exceptions.
+ * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
+ */
+
+#include "PathHelper.h"
+
+#include <nvtx3/nvToolsExt.h>
+#include <nvtx3/nvToolsExtCuda.h>
+#include <nvtx3/nvToolsExtCudaRt.h>
+#include <nvtx3/nvToolsExtOpenCL.h>
+#include <nvtx3/nvToolsExtSync.h>
+
+#include <iostream>
+#include <string>
+
+// X-macro over every function the export-api library must export: FOR_EACH_EXPORT(func)
+// expands to func(name) once per entry below (64 entries), with nothing between expansions.
+// On Linux the list can be regenerated from libexport-api.so with a bash script, e.g. "exports":
+//
+//    #!/bin/bash
+//    nm -D "$@" | perl -ne 'print if s/^\S+ T //'
+//
+// "exports libexport-api.so" then prints the bare names of the symbols that nm marks "T".
+// Pipe that through perl or sed again to wrap each name in the X-macro syntax, e.g.:
+//
+//    exports libexport-api.so | perl -ne 'chomp; print "    func($_) \\\n"'
+//
+// The output is exactly the macro body used here.  Every line it prints, including the
+// last, ends with a backslash, so keep at least one blank line after the macro; otherwise
+// the next line of code would be spliced into the macro definition.
+//
+// Always check that the dump contains the expected number of functions before using it!
+// If this list were generated automatically as part of the build, a function that the
+// library failed to export would silently drop out of the list as well, and the test
+// would never notice that it was missing.
+//
+#define FOR_EACH_EXPORT(func) \
+    func(nvtxDomainCreateA) \
+    func(nvtxDomainCreateW) \
+    func(nvtxDomainDestroy) \
+    func(nvtxDomainMarkEx) \
+    func(nvtxDomainNameCategoryA) \
+    func(nvtxDomainNameCategoryW) \
+    func(nvtxDomainRangeEnd) \
+    func(nvtxDomainRangePop) \
+    func(nvtxDomainRangePushEx) \
+    func(nvtxDomainRangeStartEx) \
+    func(nvtxDomainRegisterStringA) \
+    func(nvtxDomainRegisterStringW) \
+    func(nvtxDomainResourceCreate) \
+    func(nvtxDomainResourceDestroy) \
+    func(nvtxDomainSyncUserAcquireFailed) \
+    func(nvtxDomainSyncUserAcquireStart) \
+    func(nvtxDomainSyncUserAcquireSuccess) \
+    func(nvtxDomainSyncUserCreate) \
+    func(nvtxDomainSyncUserDestroy) \
+    func(nvtxDomainSyncUserReleasing) \
+    func(nvtxInitialize) \
+    func(nvtxMarkA) \
+    func(nvtxMarkEx) \
+    func(nvtxMarkW) \
+    func(nvtxNameCategoryA) \
+    func(nvtxNameCategoryW) \
+    func(nvtxNameClCommandQueueA) \
+    func(nvtxNameClCommandQueueW) \
+    func(nvtxNameClContextA) \
+    func(nvtxNameClContextW) \
+    func(nvtxNameClDeviceA) \
+    func(nvtxNameClDeviceW) \
+    func(nvtxNameClEventA) \
+    func(nvtxNameClEventW) \
+    func(nvtxNameClMemObjectA) \
+    func(nvtxNameClMemObjectW) \
+    func(nvtxNameClProgramA) \
+    func(nvtxNameClProgramW) \
+    func(nvtxNameClSamplerA) \
+    func(nvtxNameClSamplerW) \
+    func(nvtxNameCuContextA) \
+    func(nvtxNameCuContextW) \
+    func(nvtxNameCuDeviceA) \
+    func(nvtxNameCuDeviceW) \
+    func(nvtxNameCuEventA) \
+    func(nvtxNameCuEventW) \
+    func(nvtxNameCuStreamA) \
+    func(nvtxNameCuStreamW) \
+    func(nvtxNameCudaDeviceA) \
+    func(nvtxNameCudaDeviceW) \
+    func(nvtxNameCudaEventA) \
+    func(nvtxNameCudaEventW) \
+    func(nvtxNameCudaStreamA) \
+    func(nvtxNameCudaStreamW) \
+    func(nvtxNameOsThreadA) \
+    func(nvtxNameOsThreadW) \
+    func(nvtxRangeEnd) \
+    func(nvtxRangePop) \
+    func(nvtxRangePushA) \
+    func(nvtxRangePushEx) \
+    func(nvtxRangePushW) \
+    func(nvtxRangeStartA) \
+    func(nvtxRangeStartEx) \
+    func(nvtxRangeStartW) \
+
+// ^ The line above must stay blank, because the macro's last line ends with a backslash
+
+// Looks up fnName in hDll via GET_DLL_FUNC and returns the result cast to FnPtr.  The name
+// is also appended to "found" when that pointer is non-null, or to "missing" when it is null.
+// NOTE(review): std::vector is used here, but this file never includes <vector> directly --
+// presumably PathHelper.h brings it in; confirm.
+template <typename FnPtr>
+FnPtr GetExport(
+    DLL_HANDLE hDll,
+    const char* fnName,
+    std::vector<const char*>& found,
+    std::vector<const char*>& missing)
+{
+    FnPtr const exported = (FnPtr)GET_DLL_FUNC(hDll, fnName);
+
+    // Log the name in whichever list matches the outcome
+    std::vector<const char*>& outcome = exported ? found : missing;
+    outcome.push_back(fnName);
+    return exported;
+}
+
+#define DEFINE_AND_GET_FN_PTR_FOR_EXPORT(fn) \
+    auto pfn_##fn = GetExport<decltype(&fn)>(hDll, #fn, foundFuncs, missingFuncs);
+
+extern "C" NVTX_DYNAMIC_EXPORT
+int RunTest(int argc, const char** argv)
+{
+    NVTX_EXPORT_UNMANGLED_FUNCTION_NAME
+
+    bool verbose = false;
+    const std::string verboseArg = "-v";
+    for (; *argv; ++argv)
+    {
+        if (*argv == verboseArg) verbose = true;
+    }
+
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    // Construct abs path to export-api library
+    std::string exportApiLib = AbsolutePathToLibraryInCurrentProcessPath("export-api");
+
+    // Load export-api library
+    DLL_HANDLE hDll = DLL_OPEN(exportApiLib.c_str());
+    if (!hDll) return 201;
+
+    std::vector<const char*> foundFuncs, missingFuncs;
+
+    // For each export, try to GET_DLL_FUNC for it
+    //     - Don't early-out, print list of all failed exports
+
+    //auto pfn_nvtxMarkA = GetExport<decltype(&nvtxMarkA)>(hDll, "nvtxMarkA", foundFuncs, missingFuncs);
+    //auto pfn_nvtxDomainCreateA = GetExport<decltype(&nvtxDomainCreateA)>(hDll, "nvtxDomainCreateA", foundFuncs, missingFuncs);
+    // ...
+    FOR_EACH_EXPORT(DEFINE_AND_GET_FN_PTR_FOR_EXPORT)
+
+    if (verbose) std::cout << " - Got non-zero pointers for " << foundFuncs.size() << " NVTX functions.\n";
+
+    if (verbose) std::cout << " - Trying to call some NVTX functions through the exports...\n";
+
+    // For a few simple functions, try calling them through function pointers with
+    // harmless args.  If the calling conventions are wrong, these calls will crash.
+    // If they are working, the NVTX injection should load and print something.
+    if (pfn_nvtxMarkA)
+    {
+        pfn_nvtxMarkA("Testing nvtxMarkA");
+    }
+
+    if (pfn_nvtxDomainCreateA)
+    {
+        auto hDomain = pfn_nvtxDomainCreateA("Testing nvtxDomainCreateA");
+        (void)hDomain;
+    }
+
+    if (verbose) std::cout << " - Survived calling NVTX functions.\n";
+
+    if (!missingFuncs.empty())
+    {
+        if (verbose)
+        {
+            std::cout << "Missing exports:\n";
+            for (auto fnName : missingFuncs)
+            {
+                std::cout << "    " << fnName << "\n";
+            }
+        }
+        return 202;
+    }
+
+    if (verbose) std::cout << "-------------------------------------\n";
+
+    return 0;
+}
diff --git a/tests/mingw-w64-x86_64.cmake b/tests/mingw-w64-x86_64.cmake
new file mode 100644
index 0000000..39c3be5
--- /dev/null
+++ b/tests/mingw-w64-x86_64.cmake
@@ -0,0 +1,15 @@
+set(CMAKE_SYSTEM_NAME Windows)  # Target platform: 64-bit Windows, built with the MinGW-w64 tools below
+set(CMAKE_SYSTEM_PROCESSOR x86_64)
+# Target triplet: prefixes each tool name and names the target root directory under /usr
+set(TOOLCHAIN_PREFIX x86_64-w64-mingw32)
+# Cross tools for C, C++, Fortran, and Windows resource files, given by name only
+set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}-gcc)
+set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}-g++)
+set(CMAKE_Fortran_COMPILER ${TOOLCHAIN_PREFIX}-gfortran)
+set(CMAKE_RC_COMPILER ${TOOLCHAIN_PREFIX}-windres)
+# Directory that find_*() commands treat as the root of the target environment
+set(CMAKE_FIND_ROOT_PATH /usr/${TOOLCHAIN_PREFIX})
+# Never look for programs under the target root; look for libraries and headers only there
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)