diff --git a/CMakeLists.txt b/CMakeLists.txt
index c4ae16c..f6603ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,11 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+set(DEEPEERT_VERSION_MAJOR 0)
+set(DEEPEERT_VERSION_MINOR 3)
+set(DEEPEERT_VERSION_PATCH 2)
+set(DEEPEERT_VERSION ${DEEPEERT_VERSION_MAJOR}.${DEEPEERT_VERSION_MINOR}.${DEEPEERT_VERSION_PATCH})
+  
 set(CMAKE_BUILD_TYPE_INIT "Release")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_CXX_STANDARD 20)
@@ -8,7 +13,7 @@ set(CMAKE_CUDA_ARCHITECTURES native)
 
 cmake_minimum_required(VERSION 3.16)
 cmake_policy(SET CMP0048 NEW)
-project(DeePeeRT VERSION 0.1.0 LANGUAGES C CXX CUDA)
+project(deepeeRT VERSION ${DEEPEERT_VERSION} LANGUAGES C CXX CUDA)
 
 if(NOT SET_UP_CONFIGURATIONS_DONE)
   set(SET_UP_CONFIGURATIONS_DONE 1)
@@ -33,14 +38,91 @@ if ((${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR}))
   SET(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})
 endif()
 
-option(DEEPEE_USE_LOCAL_CUBQL "Use *local* cuBQL" OFF)
-mark_as_advanced(DEEPEE_USE_LOCAL_CUBQL)
-if (DEEPEE_USE_LOCAL_CUBQL)
+option(DEEPEERT_USE_LOCAL_CUBQL "Use *local* cuBQL" OFF)
+mark_as_advanced(DEEPEERT_USE_LOCAL_CUBQL)
+if (DEEPEERT_USE_LOCAL_CUBQL)
   add_subdirectory(../cuBQL EXCLUDE_FROM_ALL builddir_cuBQL)
 else()
   add_subdirectory(submodules/cuBQL EXCLUDE_FROM_ALL builddir_cuBQL)
 endif()
 
 add_subdirectory(dp)
-add_subdirectory(miniapp)
+
+if (NOT (${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR}))
+  # don't build miniapp if this is a subproject
+else()
+#  add_subdirectory(miniapp)
+endif()
+
+
+# ==================================================================
+# install target(s)
+# ==================================================================
+
+include(GNUInstallDirs)
+
+set(DEEPEERT_CMAKE_INSTALL_DESTINATION
+  ${CMAKE_INSTALL_LIBDIR}/cmake/deepeeRT-${DEEPEERT_VERSION}#${PROJECT_VERSION}
+)
+
+include(CMakePackageConfigHelpers)
+
+configure_package_config_file(
+  "${PROJECT_SOURCE_DIR}/dp/deepeeRTConfig.cmake.in"
+  "${PROJECT_BINARY_DIR}/deepeeRTConfig.cmake"
+  INSTALL_DESTINATION
+  ${DEEPEERT_CMAKE_INSTALL_DESTINATION}
+)
+
+message("DEEPEERT_VERSION ${DEEPEERT_VERSION}")
+write_basic_package_version_file(
+    "deepeeRTConfigVersion.cmake"
+    VERSION ${DEEPEERT_VERSION}
+    COMPATIBILITY SameMajorVersion
+)
+
+install(FILES
+  ${CMAKE_CURRENT_BINARY_DIR}/deepeeRTConfig.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/deepeeRTConfigVersion.cmake
+  DESTINATION
+  ${DEEPEERT_CMAKE_INSTALL_DESTINATION}
+  )
+
+
+install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/cmake
+  DESTINATION
+    ${DEEPEERT_CMAKE_INSTALL_DESTINATION}
+  FILES_MATCHING
+    PATTERN *.cmake
+    PATTERN FinddeepeeRT.cmake EXCLUDE
+)
+
+install(
+  TARGETS
+  deepeeRT
+#  EXPORT deepeeRT
+  EXPORT deepeeRT-config
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  NAMELINK_SKIP
+  # on Windows put the dlls into bin
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  # ... and the import lib into the devel package
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
+install(EXPORT deepeeRT-config
+#install(EXPORT deepeeRT
+  DESTINATION ${DEEPEERT_CMAKE_INSTALL_DESTINATION}
+  NAMESPACE deepeeRT::
+  )
+
+# configures the public api
+configure_file(
+  ${PROJECT_SOURCE_DIR}/include/deepeeRT/deepeeRT.in.h
+  include/deepeeRT/deepeeRT.h
+)
+
+install(
+  DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/deepeeRT
+  DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}
+)
 
diff --git a/dp/Backend.cpp b/dp/Backend.cpp
index 1b8eb7d..cc258c3 100644
--- a/dp/Backend.cpp
+++ b/dp/Backend.cpp
@@ -5,10 +5,4 @@
 #include "dp/Context.h"
 
 namespace dp {
-
-  Backend::Backend(Context *const context)
-    : context(context),
-      gpuID(context->gpuID)
-  {}
-
 } // ::dp
diff --git a/dp/Backend.h b/dp/Backend.h
index c27d0fe..27a8c58 100644
--- a/dp/Backend.h
+++ b/dp/Backend.h
@@ -10,7 +10,9 @@ namespace dp {
   struct TrianglesDPGroup;
   struct InstancesDPGroup;
   
-  /*! CAREFUL: this HAS to match the data layout of DPRRay in deepee.h !*/
+  /*! Internal representation of a ray in the input ray
+      queue. CAREFUL: this HAS to match the data layout of DPRRay in
+      deepee.h !*/
   struct Ray {
     vec3d origin;
     vec3d direction;
@@ -18,14 +20,18 @@ namespace dp {
     double  tMax;
   };
 
-  /*! CAREFUL: this HAS to match the data layout of DPRHit in deepee.h !*/
+  /*! Internal representation of a hit in the hit queue to be traced
+      against. CAREFUL: this HAS to match the data layout of DPRHit in
+      deepee.h !*/
   struct Hit {
     /*! index of prim within the geometry it was created in. A value of
       '-1' means 'no hit' */
     int     primID;
+    
     /* index of the instance that contained the hit point. Undefined if
        on hit occurred */
     int     instID;
+    
     /*! user-supplied geom ID (the one specified during geometry create
       call) for the geometry that contained the hit. Unlike primID and
       instID this is *not* a linear ID, but whatever int64 value the
@@ -35,41 +41,6 @@ namespace dp {
     double  u, v;
   };
 
-  /*! implements a group of double-precision triangles */
-  struct TrianglesDPImpl {
-    TrianglesDPImpl(TrianglesDPGroup *const fe) : fe(fe) {}
-    virtual ~TrianglesDPImpl() = default;
-    TrianglesDPGroup *const fe;
-  };
-    
-  /*! implements a group of double-precision instances, including the
-    actual trace() method */
-  struct InstancesDPImpl {
-    InstancesDPImpl(InstancesDPGroup *const fe) : fe(fe) {}
-    virtual ~InstancesDPImpl() = default;
-
-    virtual void trace(Ray *rays,
-                       Hit *hits,
-                       int numRays) = 0;
-    
-    InstancesDPGroup *const fe;
-  };
 
-  /*! implements an actual backend for a double-precision ray tracing
-      context. primarily acts as 'factory' for instance and geometry
-      groups that then do the actual work */
-  struct Backend {
-    Backend(Context *const context);
-    virtual ~Backend() = default;
-    
-    virtual std::shared_ptr<InstancesDPImpl>
-    createInstancesDPImpl(dp::InstancesDPGroup *fe) = 0;
-    
-    virtual std::shared_ptr<TrianglesDPImpl>
-    createTrianglesDPImpl(dp::TrianglesDPGroup *fe) = 0;
-    
-    Context *const context;
-    int const gpuID;
-  };
-  
+  static Context *createBackend(int gpuID);
 } // ::dp
diff --git a/dp/CMakeLists.txt b/dp/CMakeLists.txt
index 5708f61..62df336 100644
--- a/dp/CMakeLists.txt
+++ b/dp/CMakeLists.txt
@@ -3,36 +3,52 @@
 
 find_package(CUDAToolkit REQUIRED)
 
-add_library(deepeeRT_common STATIC
-  Backend.h
-  Backend.cpp
+add_library(deepeeRT_common INTERFACE#STATIC
 )
-target_include_directories(deepeeRT_common PUBLIC
-  ${PROJECT_SOURCE_DIR}
-  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+target_include_directories(deepeeRT_common INTERFACE
+  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>
+  $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}>/include
+#  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
 )
-target_link_libraries(deepeeRT_common PUBLIC
+target_link_libraries(deepeeRT_common INTERFACE#PUBLIC
   cuBQL
   CUDA::cuda_driver
   CUDA::cudart_static
 )
+target_compile_definitions(deepeeRT_common INTERFACE#PUBLIC
+  -DdeepeeRT_STATIC=1)
 
 add_library(deepeeRT STATIC
-  ../include/deepee/deepee.h
+  ../include/deepeeRT/deepeeRT.h
+  Backend.h
+  Backend.cpp
   common.h
   Triangles.h
-  Triangles.cu
+  Triangles.cpp
   World.h
-  World.cu
+  World.cpp
+  Group.h
+  Group.cpp 
   Context.h
   Context.cpp
-  Group.h
-  Group.cu
   deepeeRT.cpp
 )
+target_include_directories(deepeeRT PUBLIC
+  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+  )
 
 add_subdirectory(cuBQL)
-target_link_libraries(deepeeRT PUBLIC
-  deepeeRT_common
-  deepeeRT_cuBQL
+target_link_libraries(deepeeRT PRIVATE#PUBLIC
+  $<BUILD_INTERFACE:deepeeRT_cuBQL>
+  $<BUILD_INTERFACE:deepeeRT_common>
+)
+
+set_target_properties(deepeeRT
+  PROPERTIES
+  CXX_VISIBILITY_PRESET  hidden
+  CUDA_VISIBILITY_PRESET hidden
+  CUDA_SEPARABLE_COMPILATION   ON
+  POSITION_INDEPENDENT_CODE    ON
+  CUDA_USE_STATIC_CUDA_RUNTIME ON
+  CUDA_RESOLVE_DEVICE_SYMBOLS  ON
 )
diff --git a/dp/Context.cpp b/dp/Context.cpp
index 27bc978..b62eaf8 100644
--- a/dp/Context.cpp
+++ b/dp/Context.cpp
@@ -8,9 +8,7 @@ namespace dp {
   
   Context::Context(int gpuID)
     : gpuID(gpuID)
-  {
-    backend = std::make_shared<CuBQLCUDABackend>(this);
-  }
-  
+  {}
+
 } // ::dp
 
diff --git a/dp/Context.h b/dp/Context.h
index 5605bb5..9a13515 100644
--- a/dp/Context.h
+++ b/dp/Context.h
@@ -5,13 +5,42 @@
 
 #include "dp/common.h"
 #include "dp/Backend.h"
+#include <memory>
 
 namespace dp {
+
+  struct InstanceGroup;
+  struct TriangleMesh;
+  struct TrianglesGroup;
   
   struct Context {
+    static Context *create(int gpuID);
+    
     Context(int gpuID);
 
-    std::shared_ptr<Backend> backend;
+    /*! creates a 'world' as a grouping of triangle mesh groups, with
+        associated object-to-world space instance
+        transforms. Implements the dprTrace() API function */
+    virtual dp::InstanceGroup *
+    createInstanceGroup(const std::vector<dp::TrianglesGroup *> &groups,
+                        const DPRAffine *transforms) = 0;
+
+    /*! creates an object that represents a single triangle
+        mesh. implements `dprCreateTrianglesDP()` */
+    virtual dp::TriangleMesh *
+    createTriangleMesh(uint64_t         userData,
+                       const vec3d     *vertexArray,
+                       int              vertexCount,
+                       const vec3i     *indexArray,
+                       int              indexCount) = 0;
+    
+    /*! creates an object that represents a group of multiple triangle
+        meshes that can then get instantiated. implements
+        `dprCreateTrianglesGroup()` */
+    virtual dp::TrianglesGroup *
+    createTrianglesGroup(const std::vector<dp::TriangleMesh *> &geoms) = 0;
+    
+    
     /*! the cuda gpu ID that this device is going to run on */
     int const gpuID;
   };
diff --git a/dp/Group.cpp b/dp/Group.cpp
new file mode 100644
index 0000000..5ded672
--- /dev/null
+++ b/dp/Group.cpp
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "dp/Group.h"
+#include "dp/Context.h"
+
+namespace dp {
+  
+  Group::Group(Context *const context)
+    : context(context)
+  {}
+  
+} // ::dp
+
diff --git a/dp/Group.cu b/dp/Group.cu
deleted file mode 100644
index d1a35a4..0000000
--- a/dp/Group.cu
+++ /dev/null
@@ -1,18 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#include "dp/Group.h"
-#include "dp/Context.h"
-
-namespace dp {
-
-  TrianglesDPGroup::TrianglesDPGroup(Context *context,
-                                     const std::vector<TrianglesDP *> &geoms)
-    : context(context),
-      geoms(geoms)
-  {
-    impl = context->backend->createTrianglesDPImpl(this);
-  }
-
-} // ::dp
-
diff --git a/dp/Group.h b/dp/Group.h
index 78d63fa..47b6808 100644
--- a/dp/Group.h
+++ b/dp/Group.h
@@ -3,31 +3,22 @@
 
 #pragma once
 
-#include "dp/Triangles.h"
+#include "dp/common.h"
 
 namespace dp {
 
   struct Context;
-  struct TrianglesDPImpl;
 
-  /*! allows for referencing a specific primitive within a specific
-      geometry within multiple geometries that a group may be built
-      over */
-  struct PrimRef {
-    int geomID;
-    int primID;
-  };
-  
+  /*! abstract base class of a group of one or more intersectable
+      things that share an acceleration structures. will be subclassed
+      into InstancesGroup and TrianglesGroup, and then implemented in
+      each backend based on how this backend works */
   struct Group {
-  };
-  
-  struct TrianglesDPGroup : public Group {
-    TrianglesDPGroup(Context *context,
-                     const std::vector<TrianglesDP *> &geoms);
-
-    std::vector<TrianglesDP *> geoms;
+    Group(Context *const context);
+    virtual ~Group() = default;
+    
+    /*! context that this group was created in */
     Context *const context;
-    std::shared_ptr<TrianglesDPImpl> impl;
   };
 
 } // ::dp
diff --git a/dp/Triangles.cpp b/dp/Triangles.cpp
new file mode 100644
index 0000000..a5733ed
--- /dev/null
+++ b/dp/Triangles.cpp
@@ -0,0 +1,32 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "dp/Triangles.h"
+#include "dp/Context.h"
+
+namespace dp {
+  TriangleMesh::TriangleMesh(Context         *context,
+                             uint64_t         userData,
+                             const vec3d     *_vertexArray,
+                             int              vertexCount,
+                             const vec3i     *_indexArray,
+                             int              indexCount)
+    : userData(userData),
+      context(context)
+  {
+    /* iw - note this class will NOT store any pointers to host data,
+       it's the job of the derived class(es) to sture data as, if, and
+       where required*/
+  }
+  
+  TrianglesGroup::TrianglesGroup(Context *context,
+                                 const std::vector<TriangleMesh *> &geoms)
+    : Group(context)
+  {
+    /* iw - note this class will NOT store any pointers to host data,
+       it's the job of the derived class(es) to sture data as, if, and
+       where required*/
+  }
+  
+} // ::dp
+
diff --git a/dp/Triangles.cu b/dp/Triangles.cu
deleted file mode 100644
index 4781a01..0000000
--- a/dp/Triangles.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#include "dp/Triangles.h"
-#include "dp/Context.h"
-
-namespace dp {
-  TrianglesDP::TrianglesDP(Context         *context,
-                           uint64_t         userData,
-                           const vec3d     *_vertexArray,
-                           int              vertexCount,
-                           const vec3i     *_indexArray,
-                           int              indexCount)
-    : userData(userData),
-      vertexCount(vertexCount),
-      indexCount(indexCount),
-      context(context)
-  {
-    cudaMalloc((void**)&vertexArray,vertexCount*sizeof(vec3d));
-    cudaMemcpy((void*)vertexArray,_vertexArray,
-               vertexCount*sizeof(vec3d),cudaMemcpyDefault);
-    cudaMalloc((void**)&indexArray,indexCount*sizeof(vec3i));
-    cudaMemcpy((void*)indexArray,_indexArray,
-               indexCount*sizeof(vec3i),cudaMemcpyDefault);
-  }
-
-  TrianglesDP::~TrianglesDP()
-  {
-    cudaFree((void*)indexArray);
-    cudaFree((void*)vertexArray);
-  }
-  
-} // ::dp
-
diff --git a/dp/Triangles.h b/dp/Triangles.h
index 052148d..b66bbfa 100644
--- a/dp/Triangles.h
+++ b/dp/Triangles.h
@@ -4,29 +4,50 @@
 #pragma once
 
 #include "dp/common.h"
+#include "dp/Group.h"
 
 namespace dp {
 
   struct Context;
 
-  /*! a mesh of triangles, for a dp context, with vertices in doubles. This calss will make a *copy* of the input arrays */
-  struct TrianglesDP {
-    TrianglesDP(Context         *context,
-                uint64_t         userData,
-                const vec3d     *vertexArray,
-                int              vertexCount,
-                const vec3i     *indexArray,
-                int              indexCount);
-    virtual ~TrianglesDP();
-    
+  /*! (virtual base class for) a mesh of triangles, for a dp context,
+      with vertices in doubles. implementations of this class may
+      create copies of the input arays on either host and/or device */
+  struct TriangleMesh {
+    TriangleMesh(Context         *context,
+                 uint64_t         userData,
+                 const vec3d     *vertexArray,
+                 int              vertexCount,
+                 const vec3i     *indexArray,
+                 int              indexCount);
+    virtual ~TriangleMesh() = default;
+
+    /* iw - note this base class will NOT store any pointers to host
+       data, it's the job of the derived class(es) to sture data as,
+       if, and where required*/
     uint64_t     const userData      = 0;
-    const vec3d *const vertexArray   = 0;
-    const vec3i *const indexArray    = 0;
-    int          const vertexCount   = 0;
-    int          const indexCount    = 0;
     Context     *const context;
   };
 
+  /*! allows for referencing a specific primitive within a specific
+      geometry within multiple geometries that a group may be built
+      over */
+  struct PrimRef {
+    int geomID;
+    int primID;
+  };
+
+  /*! a "group" of one or more triangle meshes, including the
+      acceleration structure to trace a ray against those triangles */
+  struct TrianglesGroup : public Group {
+    TrianglesGroup(Context *context,
+                   const std::vector<TriangleMesh *> &geoms);
+    
+    /* iw - note this base class will NOT store any pointers to host
+       data, it's the job of the derived class(es) to sture data as,
+       if, and where required*/
+  };
+
 } // ::dp
 
 
diff --git a/dp/World.cpp b/dp/World.cpp
new file mode 100644
index 0000000..238e202
--- /dev/null
+++ b/dp/World.cpp
@@ -0,0 +1,10 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "dp/World.h"
+#include "dp/Group.h"
+#include "dp/Context.h"
+
+namespace dp {
+} // ::dp
+
diff --git a/dp/World.cu b/dp/World.cu
deleted file mode 100644
index 7b976f2..0000000
--- a/dp/World.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#include "dp/World.h"
-#include "dp/Group.h"
-#include "dp/Context.h"
-
-namespace dp {
-
-  InstancesDPGroup::InstancesDPGroup(Context *context,
-                                     const std::vector<Group *> &groups,
-                                     const DPRAffine            *d_transforms)
-    : context(context),
-      groups(groups),
-      d_transforms(d_transforms)
-  {
-    impl = context->backend->createInstancesDPImpl(this);
-  }
-  
-  void InstancesDPGroup::traceRays(DPRRay *d_rays, DPRHit *d_hits, int numRays)
-  {
-    CUBQL_CUDA_SYNC_CHECK();
-    if (!isDevicePointer(d_rays) ||
-        !isDevicePointer(d_hits))
-      throw std::runtime_error("the rays[] and hits[] arrays passed to dpTrace (currently?) have to point to device memory.");
-    impl->trace((Ray*)d_rays,
-                (Hit*)d_hits,
-                numRays);
-  }
-
-} // ::dp
-
diff --git a/dp/World.h b/dp/World.h
index f46a1df..d173cb3 100644
--- a/dp/World.h
+++ b/dp/World.h
@@ -1,28 +1,34 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
 
-#include "dp/Triangles.h"
+#include "dp/Group.h"
 
 namespace dp {
 
   struct Context;
-  struct Group;
-
-  struct InstancesDPImpl;
-  
-  struct InstancesDPGroup {
-    InstancesDPGroup(Context *context,
-                     const std::vector<Group *> &groups,
-                     const DPRAffine            *d_transforms);
-    
-    void traceRays(DPRRay *d_rays, DPRHit *d_hits, int numRays);
-    
-    std::vector<Group *> const groups;
-    const DPRAffine     *const d_transforms;
-    Context             *const context;
-    std::shared_ptr<InstancesDPImpl> impl;
+  struct TrianglesGroup;
+
+  /*! a group of double precision instances; each instance is defined
+      by a affine transforms and TrianglesDPGroup that it refers to */
+  struct InstanceGroup : public Group {
+    InstanceGroup(Context *context,
+                  const std::vector<TrianglesGroup *> &groups,
+                  const DPRAffine            *transforms)
+      : Group(context)
+    {}
+
+    /*! implements dprTrace() */
+    virtual void traceRays(DPRRay *d_rays,
+                           DPRHit *d_hits,
+                           int numRays,
+                           uint64_t flags) = 0;
+
+    /* iw - note this base class will NOT store any pointers to host
+       data, it's the job of the derived class(es) to store data as,
+       if, and where required*/
   };
     
 } // ::dp
diff --git a/dp/common.h b/dp/common.h
index e2132be..c6a5eff 100644
--- a/dp/common.h
+++ b/dp/common.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "deepee/deepee.h"
+#include "deepeeRT/deepeeRT.h"
 #include "cuBQL/math/box.h"
 #include "cuBQL/math/linear.h"
 #include <cuda_runtime.h>
diff --git a/dp/cuBQL/CMakeLists.txt b/dp/cuBQL/CMakeLists.txt
index 9cc012c..70c096f 100644
--- a/dp/cuBQL/CMakeLists.txt
+++ b/dp/cuBQL/CMakeLists.txt
@@ -1,14 +1,19 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-add_library(deepeeRT_cuBQL STATIC
+add_library(deepeeRT_cuBQL OBJECT
   CuBQLBackend.h
-  CuBQLBackend.cu
+  CuBQLBackend.cu 
+  Triangles.h
+  Triangles.cu
+  InstanceGroup.h
+  InstanceGroup.cu
 )
-target_link_libraries(deepeeRT_cuBQL PUBLIC
+target_link_libraries(deepeeRT_cuBQL
+  PUBLIC
   deepeeRT_common
   cuBQL
-#  $<BUILD_INTERFACE:cuBQL_cuda_float3_static>
-  $<BUILD_INTERFACE:cuBQL_cuda_double3_static>
+  # do not link to cuBQL_cuda_double3_static; we explicitly
+  # instantiate builder(s) in CubQLBackend.cu
 )
 
diff --git a/dp/cuBQL/CuBQLBackend.cu b/dp/cuBQL/CuBQLBackend.cu
index 8db7220..3fbf7f6 100644
--- a/dp/cuBQL/CuBQLBackend.cu
+++ b/dp/cuBQL/CuBQLBackend.cu
@@ -1,305 +1,83 @@
 // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
+#define CUBQL_GPU_BUILDER_IMPLEMENTATION 1
+#define CUBQL_CPU_BUILDER_IMPLEMENTATION 1
+
 #include "dp/cuBQL/CuBQLBackend.h"
-#include "dp/Context.h"
-#include "dp/Group.h"
-#include "dp/World.h"
+#include "dp/cuBQL/Triangles.h"
+#include "dp/cuBQL/InstanceGroup.h"
+
+namespace cuBQL {
+  namespace cpu {
+    template
+    void spatialMedian(BinaryBVH<double,3>   &bvh,
+                       const box_t<double,3> *boxes,
+                       uint32_t          numPrims,
+                       BuildConfig       buildConfig);
+    template
+    void freeBVH(BinaryBVH<double,3> &bvh);
+
+  }
+  namespace cuda {
+    template
+    void sahBuilder(BinaryBVH<double,3>   &bvh,
+                    const box_t<double,3> *boxes,
+                    uint32_t          numPrims,
+                    BuildConfig       buildConfig,
+                    cudaStream_t       s,
+                    GpuMemoryResource &memResource);
+
+    template
+    void free(BinaryBVH<double,3> &bvh,
+              cudaStream_t      s,
+              GpuMemoryResource& memResource);
+  }
+}
 
-#include <cuBQL/bvh.h>
-#include <cuBQL/math/Ray.h>
-#include <cuBQL/queries/triangleData/Triangle.h>
-#include <cuBQL/queries/triangleData/math/rayTriangleIntersections.h>
-#include <cuBQL/traversal/rayQueries.h>
 
 namespace dp {
   namespace cubql_cuda {
-    using namespace ::cuBQL;
-    
-    using bvh3d = bvh_t<double,3>;
-
-    using TriangleDP = cuBQL::triangle_t<double>;
-    using RayTriangleIntersection = cuBQL::RayTriangleIntersection_t<double>;
-    
-    /*! an array that can upload an array from host to device, and free
-      on destruction. If the pointer provided is *already* a device
-      pointer this will just use that pointer */
-    template<typename T>
-    struct AutoUploadArray {
-      AutoUploadArray(const T *elements, int count)
-      {
-        this->count = count;
-        if (isDevicePointer(elements)) {
-          this->elements = elements;
-          this->needsCudaFree = false;
-        } else {
-          cudaMalloc((void **)&this->elements,count*sizeof(T));
-          cudaMemcpy((void*)this->elements,elements,count*sizeof(T),
-                     cudaMemcpyDefault);
-          this->needsCudaFree = true;
-        }
-      }
-      
-      ~AutoUploadArray() { if (needsCudaFree) cudaFree((void*)elements); }
-    
-      const T *elements      = 0;
-      int  count         = 0;
-      bool needsCudaFree = false;
-    };
-
-    /*! triangle mesh representation on the device */
-    struct DevMesh {
-      const vec3d   *vertices;
-      const vec3i   *indices;
-      uint64_t userData;
-    };
-  
-    struct HostMesh {
-      HostMesh(uint64_t userData,
-               const vec3d *verticesArray, int verticesCount,
-               const vec3i *indicesArray, int indicesCount)
-        : userData(userData),
-          vertices(verticesArray,verticesCount),
-          indices(indicesArray,indicesCount)
-      {}
-    
-      DevMesh getDD() const
-      { return { vertices.elements, indices.elements, userData }; }
-    
-      AutoUploadArray<vec3d> vertices;
-      AutoUploadArray<vec3i> indices;
-      uint64_t         const userData;
-    };
-  
-    struct TrianglesDP : public dp::TrianglesDPImpl {
-      TrianglesDP(CuBQLCUDABackend *be,
-                  dp::TrianglesDPGroup *fe);
-      virtual ~TrianglesDP();
-    
-      struct DevGroup {
-        inline __cubql_both TriangleDP getTriangle(PrimRef prim) const;
-        
-        bvh3d     bvh;
-        DevMesh  *meshes;
-        PrimRef  *primRefs;
-      };
-
-      DevGroup getDevGroup() const
-      { return { bvh,meshes,primRefs }; }
-      
-      bvh3d     bvh;
-      // DevGroup *group  = nullptr;
-      DevMesh  *meshes = nullptr;
-      PrimRef  *primRefs = nullptr;
-      
-      /*! these are stored and owned on the host, and also manage their
-        vertex arrays' ownerhip; but vertex arrays themselves will be
-        device accessible */
-      std::vector<std::shared_ptr<HostMesh>> hostMeshes;
-    
-      CuBQLCUDABackend     *const be;
-    };
-
-    inline __cubql_both
-    TriangleDP TrianglesDP::DevGroup::getTriangle(PrimRef prim) const
+    CuBQLCUDABackend::CuBQLCUDABackend(int gpuID)
+      : Context(gpuID)
     {
-      DevMesh mesh = meshes[prim.geomID];
-      vec3i idx = mesh.indices[prim.primID];
-      return { mesh.vertices[idx.x],mesh.vertices[idx.y],mesh.vertices[idx.z] };
+      SetActiveGPU forDuration(gpuID);
+      cudaFree(0);
     }
-    
-    __global__
-    void generateTriangleInputs(int meshID,
-                                PrimRef *primRefs,
-                                box3d *primBounds,
-                                int numTrisThisMesh,
-                                DevMesh mesh)
-    {
-      int tid = threadIdx.x+blockIdx.x*blockDim.x;
-      if (tid >= numTrisThisMesh) return;
 
-      vec3i idx = mesh.indices[tid];
-      box3d bb;
-      bb.extend(mesh.vertices[idx.x]);
-      bb.extend(mesh.vertices[idx.y]);
-      bb.extend(mesh.vertices[idx.z]);
 
-      primRefs[tid] = { meshID, tid };
-      primBounds[tid] = bb;
-    }
-    
-    TrianglesDP::TrianglesDP(CuBQLCUDABackend *be,
-                             dp::TrianglesDPGroup *fe)
-      : TrianglesDPImpl(fe), be(be)
-    {
-      SetActiveGPU forDuration(be->gpuID);
-      
-      int numTrisTotal = 0;
-      std::vector<DevMesh> devMeshes;
-      for (auto geom : fe->geoms) {
-        numTrisTotal += geom->indexCount;
-        // this will automatically upload the vertex arrays if so required:
-        auto hm = std::make_shared<HostMesh>
-          (geom->userData,
-           geom->vertexArray,geom->vertexCount,
-           geom->indexArray,geom->indexCount);
-        hostMeshes.push_back(hm);
-        devMeshes.push_back(hm->getDD());
-      }
-      cudaMalloc((void **)&meshes,devMeshes.size()*sizeof(DevMesh));
-      cudaMemcpy((void*)meshes,devMeshes.data(),
-                 devMeshes.size()*sizeof(DevMesh),cudaMemcpyDefault);
-      
-      cudaMalloc((void **)&primRefs,numTrisTotal*sizeof(*primRefs));
-
-      box3d   *primBounds = nullptr;
-      cudaMalloc((void **)&primBounds,numTrisTotal*sizeof(*primBounds));
-
-      int offset = 0;
-      for (int meshID=0;meshID<(int)hostMeshes.size();meshID++) {
-        auto &hm = hostMeshes[meshID];
-        int count = hm->indices.count;
-        int bs = 128;
-        int nb = divRoundUp(count,bs);
-        generateTriangleInputs<<<nb,bs>>>(meshID,
-                                          primRefs+offset,
-                                          primBounds+offset,
-                                          count,
-                                          hm->getDD());
-        offset += count;
-      }
-      cudaStreamSynchronize(0);
-
-      std::cout << "#dpr: building BVH over " << prettyNumber(numTrisTotal)
-                << " triangles" << std::endl;
-      CUBQL_CUDA_SYNC_CHECK();
-      DeviceMemoryResource memResource;
-#if 0
-      ::cuBQL::gpuBuilder(bvh,
-                                primBounds,
-                                numTrisTotal,
-                                ::cuBQL::BuildConfig(),
-                                0,
-                                memResource);
-#else
-      ::cuBQL::cuda::sahBuilder(bvh,
-                                primBounds,
-                                numTrisTotal,
-                                ::cuBQL::BuildConfig(),
-                                0,
-                                memResource);
-#endif
-      std::cout << "#dpr: ... bvh built." << std::endl;
-      
-      cudaFree(primBounds);
-      CUBQL_CUDA_SYNC_CHECK();
-    }
-  
-    TrianglesDP::~TrianglesDP()
+    dp::InstanceGroup *
+    CuBQLCUDABackend::
+    createInstanceGroup(const std::vector<dp::TrianglesGroup *> &groups,
+                        const DPRAffine *transforms)
     {
-      cudaFree(meshes);
-      cudaFree(primRefs);
-      ::cuBQL::cuda::free(bvh);
+      return new InstanceGroup(this, groups,(const affine3d*)transforms);
     }
     
-    __global__ void g_trace(TrianglesDP::DevGroup group,
-                            Ray *rays,
-                            Hit *hits,
-                            int numRays)
+    dp::TriangleMesh *
+    CuBQLCUDABackend::
+    createTriangleMesh(uint64_t         userData,
+                       const vec3d     *vertexArray,
+                       int              vertexCount,
+                       const vec3i     *indexArray,
+                       int              indexCount) 
     {
-      int tid = threadIdx.x+blockIdx.x*blockDim.x;
-      if (tid >= numRays) return;
-
-#ifdef NDEBUG
-      const bool dbg = false;
-#else
-      bool dbg = (tid == -1);
-#endif
-
-      
-      Hit hit = hits[tid];
-      hit.primID = -1;
-      int instID = 0;
-      ::cuBQL::ray3d ray(rays[tid].origin,
-                         rays[tid].direction,
-                         rays[tid].tMin,
-                         rays[tid].tMax);
-
-      if (dbg) {
-        cuBQL::dout << "dbg ray " << ray << "\n";
-        cuBQL::dout << "bvh.nodes " << (int*)group.bvh.nodes << "\n";
-        cuBQL::dout << "bvh.primIDs " << (int*)group.bvh.primIDs << "\n";
-        cuBQL::dout << "group.meshes " << (int*)group.meshes << "\n";
-        cuBQL::dout << "group.mesh0 " << group.meshes[0].userData << "\n";
-        cuBQL::dout << "group.primRefs " << (int*)group.primRefs << "\n";
-      }
-      auto intersectPrim = [&ray,&hit,group,instID,dbg](uint32_t primID) -> double {
-        RayTriangleIntersection isec;
-        PrimRef prim = group.primRefs[primID];
-        const TriangleDP tri = group.getTriangle(prim);
-        if (isec.compute(ray,tri)) {
-          hit.primID = prim.primID;
-          hit.instID = instID;
-          hit.geomUserData = group.meshes[prim.geomID].userData;
-          hit.t = isec.t;
-          if (dbg) printf("hit %i %i\n",hit.instID,hit.primID);
-          ray.tMax = isec.t;
-        }
-        return ray.tMax;
-      };
-      ::cuBQL::shrinkingRayQuery::forEachPrim(intersectPrim,group.bvh,ray);
-      hits[tid] = hit;;
+      return new TriangleMesh(this,
+                              userData,
+                              vertexArray,
+                              vertexCount,
+                              indexArray,
+                              indexCount);
     }
-
-
-    struct InstancesDP : public dp::InstancesDPImpl {
-      InstancesDP(CuBQLCUDABackend *be,
-                  dp::InstancesDPGroup *fe)
-        : InstancesDPImpl(fe), be(be)
-      {}
-      void trace(Ray *rays,
-                 Hit *hits,
-                 int numRays) override;
-      CuBQLCUDABackend *const be;
-    };
-
-    void InstancesDP::trace(Ray *rays,
-                            Hit *hits,
-                            int numRays) 
+    
+    dp::TrianglesGroup *
+    CuBQLCUDABackend::
+    createTrianglesGroup(const std::vector<dp::TriangleMesh *> &geoms)
     {
-      CUBQL_CUDA_SYNC_CHECK();
-      assert(fe->groups.size() == 1);
-      assert(fe->d_transforms == nullptr);
-      TrianglesDPGroup *tg = (TrianglesDPGroup *)fe->groups[0];
-      assert(tg);
-      TrianglesDP *triangles = (TrianglesDP*)tg->impl.get();
-      assert(triangles);
-
-      int bs = 128;
-      int nb = divRoundUp(numRays,bs);
-      g_trace<<<nb,bs>>>(triangles->getDevGroup(),
-                         rays,hits,
-                         numRays);
-      CUBQL_CUDA_SYNC_CHECK();
+      return new TrianglesGroup(this,geoms);
     }
-    
   } // :: cubql_cuda
   
-  CuBQLCUDABackend::CuBQLCUDABackend(Context *const context)
-    : Backend(context)
-  {
-    SetActiveGPU forDuration(context->gpuID);
-    cudaFree(0);
-  }
-
-  std::shared_ptr<InstancesDPImpl>
-  CuBQLCUDABackend::createInstancesDPImpl(dp::InstancesDPGroup *fe)
-  { return std::make_shared<cubql_cuda::InstancesDP>(this,fe); }
-    
-  std::shared_ptr<TrianglesDPImpl>
-  CuBQLCUDABackend::createTrianglesDPImpl(dp::TrianglesDPGroup *fe) 
-  { return std::make_shared<cubql_cuda::TrianglesDP>(this,fe); }
-  
 }
 
   
diff --git a/dp/cuBQL/CuBQLBackend.h b/dp/cuBQL/CuBQLBackend.h
index 962a43a..7b2eda4 100644
--- a/dp/cuBQL/CuBQLBackend.h
+++ b/dp/cuBQL/CuBQLBackend.h
@@ -3,24 +3,103 @@
 
 #pragma once
 
-#include "dp/Backend.h"
-#include "dp/Group.h"
+#include "dp/Context.h"
 #include <cuBQL/bvh.h>
+#include <cuBQL/bvh.h>
+#include <cuBQL/math/common.h>
+#include <cuBQL/math/Ray.h>
+#include <cuBQL/math/affine.h>
+#include <cuBQL/queries/triangleData/Triangle.h>
+#include <cuBQL/queries/triangleData/math/rayTriangleIntersections.h>
+#include <cuBQL/traversal/rayQueries.h>
 
 namespace dp {
-  
-  struct CuBQLCUDABackend : public dp::Backend
-  {
-    CuBQLCUDABackend(Context *const context);
-    virtual ~CuBQLCUDABackend() = default;
+  namespace cubql_cuda {
+    
+    using namespace ::cuBQL;
     
-    virtual std::shared_ptr<InstancesDPImpl>
-    createInstancesDPImpl(dp::InstancesDPGroup *fe) override;
+    using bvh3d = bvh_t<double,3>;
+    using TriangleDP = cuBQL::triangle_t<double>;
+    using RayTriangleIntersection = cuBQL::RayTriangleIntersection_t<double>;
+    using cuBQL::affine3d;
+    
+    /*! an array that can upload an array from host to device, and free
+      on destruction. If the pointer provided is *already* a device
+      pointer this will just use that pointer */
+    template<typename T>
+    struct AutoUploadArray {
+      AutoUploadArray() = default;
+      AutoUploadArray(const T *elements, size_t count);
+      AutoUploadArray(const AutoUploadArray &other) = delete;
+      ~AutoUploadArray();
+
+      // move operator
+      AutoUploadArray &operator=(AutoUploadArray &&other);
+      const T *elements      = 0;
+      size_t   count         = 0;
+      bool     needsCudaFree = false;
+    };
+  
+    struct CuBQLCUDABackend : public dp::Context
+    {
+      CuBQLCUDABackend(int gpuID);
+      virtual ~CuBQLCUDABackend() = default;
+
+      dp::InstanceGroup *
+      createInstanceGroup(const std::vector<dp::TrianglesGroup *> &groups,
+                          const DPRAffine *transforms) override;
     
-    virtual std::shared_ptr<TrianglesDPImpl>
-    createTrianglesDPImpl(dp::TrianglesDPGroup *fe) override;
-  };
+      dp::TriangleMesh *
+      createTriangleMesh(uint64_t         userData,
+                         const vec3d     *vertexArray,
+                         int              vertexCount,
+                         const vec3i     *indexArray,
+                         int              indexCount) override;
+      
+      dp::TrianglesGroup *
+      createTrianglesGroup(const std::vector<dp::TriangleMesh *> &geoms) override;
+    };
+
+#ifdef __CUDACC__
+    // ==================================================================
+    // INLINE IMPLEMENTATION SECTION
+    // ==================================================================
+    template<typename T> inline
+    AutoUploadArray<T>::AutoUploadArray(const T *elements,
+                                        size_t count)
+    {
+      this->count = count;
+      // if (isDevicePointer(elements)) {
+      //   this->elements = elements;
+      //   this->needsCudaFree = false;
+      // } else {
+      // iw - for now, ALWAYS create a copy
+      CUBQL_CUDA_SYNC_CHECK();
+      cudaMalloc((void **)&this->elements,count*sizeof(T));
+      cudaMemcpy((void*)this->elements,elements,count*sizeof(T),
+                 cudaMemcpyDefault);
+      this->needsCudaFree = true;
+      CUBQL_CUDA_SYNC_CHECK();
+    }
 
+    template<typename T> inline
+    AutoUploadArray<T> &
+    AutoUploadArray<T>::operator=(AutoUploadArray &&other)
+    {
+      elements = other.elements; other.elements = 0;
+      count = other.count; other.count = 0;
+      needsCudaFree = other.needsCudaFree; other.needsCudaFree = 0;
+      return *this;
+    }
+    
+    template<typename T> inline
+    AutoUploadArray<T>::~AutoUploadArray() {
+      if (needsCudaFree) cudaFree((void*)elements);
+      CUBQL_CUDA_SYNC_CHECK();
+    }
+#endif
+    
+  }
 }
 
 
diff --git a/dp/cuBQL/Group.cu b/dp/cuBQL/Group.cu
new file mode 100644
index 0000000..f32d571
--- /dev/null
+++ b/dp/cuBQL/Group.cu
@@ -0,0 +1,212 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "dp/cuBQL/CuBQLBackend.h"
+
+namespace dp {
+  namespace cubql_cuda {
+    __global__
+    void g_traceFirstGroupOnly(TrianglesDP::DevGroup group,
+                               Ray *rays,
+                               Hit *hits,
+                               int numRays)
+    {
+      int tid = threadIdx.x+blockIdx.x*blockDim.x;
+      if (tid >= numRays) return;
+
+#ifdef NDEBUG
+      const bool dbg = false;
+#else
+      bool dbg = (tid == -1);
+#endif
+      
+      Hit hit = hits[tid];
+      hit.primID = -1;
+      int instID = 0;
+      ::cuBQL::ray3d ray(rays[tid].origin,
+                         rays[tid].direction,
+                         rays[tid].tMin,
+                         rays[tid].tMax);
+
+      if (dbg) {
+        cuBQL::dout << "dbg ray " << ray << "\n";
+        cuBQL::dout << "bvh.nodes " << (int*)group.bvh.nodes << "\n";
+        cuBQL::dout << "bvh.primIDs " << (int*)group.bvh.primIDs << "\n";
+        cuBQL::dout << "group.meshes " << (int*)group.meshes << "\n";
+        cuBQL::dout << "group.mesh0 " << group.meshes[0].userData << "\n";
+        cuBQL::dout << "group.primRefs " << (int*)group.primRefs << "\n";
+      }
+      auto intersectPrim = [&ray,&hit,group,instID,dbg](uint32_t primID)
+        -> double
+      {
+        RayTriangleIntersection isec;
+        PrimRef prim = group.primRefs[primID];
+        const TriangleDP tri = group.getTriangle(prim);
+        if (isec.compute(ray,tri)) {
+          hit.primID = prim.primID;
+          hit.instID = instID;
+          hit.geomUserData = group.meshes[prim.geomID].userData;
+          hit.t = isec.t;
+          if (dbg) printf("hit %i %i\n",hit.instID,hit.primID);
+          ray.tMax = isec.t;
+        }
+        return ray.tMax;
+      };
+      ::cuBQL::shrinkingRayQuery::forEachPrim(intersectPrim,group.bvh,ray);
+      hits[tid] = hit;;
+    }
+
+
+
+
+
+    __global__
+    void g_traceWorld(/*! the device data for the instancegroup itself */
+                      InstanceGroup::DD model,
+                      /*! the list of instance transforms */
+                      const DPRAffine     *const d_transforms,
+                      /*! the list of instantiated groups */
+                      const TrianglesDP::DevGroup *d_instantiatedGroups,
+                      Ray *rays,
+                      Hit *hits,
+                      int numRays)
+    {
+      int tid = threadIdx.x+blockIdx.x*blockDim.x;
+      if (tid >= numRays) return;
+
+#ifdef NDEBUG
+      const bool dbg = false;
+#else
+      bool dbg = (tid == -1);
+#endif
+
+      Hit hit = hits[tid];
+      hit.primID = -1;
+      hit.instID = -1;
+      struct {
+        int instID = -1;
+        TrianglesDP::Group group;
+        ::cuBQL::ray3d ray;
+      } object;
+      ::cuBQL::ray3d worldRay(rays[tid].origin,
+                              rays[tid].direction,
+                              rays[tid].tMin,
+                              rays[tid].tMax);
+      
+      auto intersectPrim = [&ray,&hit,&object,instID,dbg](uint32_t primID)
+        -> double
+      {
+        RayTriangleIntersection isec;
+        PrimRef prim = object.group.primRefs[primID];
+        const TriangleDP tri = group.getTriangle(prim);
+        if (isec.compute(ray,tri)) {
+          hit.primID = prim.primID;
+          hit.instID = object.instID;
+          hit.geomUserData = group.meshes[prim.geomID].userData;
+          hit.t = isec.t;
+          if (dbg) printf("hit %i %i\n",hit.instID,hit.primID);
+          ray.tMax = isec.t;
+        }
+        return ray.tMax;
+      };
+      auto enterBlas = [this,model,&current]
+        (cuBQL::ray3f &out_ray,
+         cuBQL::bvh3f &out_bvh,
+         int instID) 
+      {
+        current.group  = d_instantiatedGroups[instID];
+        current.instID = instID;
+        object.ray = world.ray;
+        if (!isUnitTransform(currentInstance->worldToObjectXfm)) {
+          object.ray.origin
+            = xfmPoint(currentInstance->worldToObjectXfm,world.origin);
+          object.ray.direction
+            = xfmVector(currentInstance->worldToObjectXfm,world.direction);
+        }
+        out_bvh = {0,0,0,0};
+        out_bvh.nodes = object.group.bvhNodes;
+      };
+      auto leaveBlas = [this]() -> void {
+        /* nothing to do */
+      };
+      
+      ::cuBQL::shrinkingRayQuery::twoLevel::forEachPrim
+          (enterBlas,leaveBlas,intersectPrim,model->bvh,ray);
+      
+      hits[tid] = hit;
+    }
+
+    
+    struct InstancesDP : public dp::InstancesDPImpl {
+      InstancesDP(CuBQLCUDABackend *be,
+                  dp::InstancesDPGroup *fe)
+        : InstancesDPImpl(fe), be(be)
+      {
+        int numInstances = fe->instances.size();
+        if (numInstances == 0) return;
+
+        std::vector<TrianglesDP::DevGroup> instancedGroups;
+        for (auto feGroup : fe->groups) {
+          dp::TrianglesDPGroup *group = feGroup
+          instancedGroups.push_back(inst
+                                    }
+        cudaMalloc((void **)&d_instancedGroups,
+                   numInstances*sizeof(*d_triangleGroups));
+        cudaMemcpy((void*)d_instancedGroups,instancedGroups.data(),
+                   numInstances*sizeof(*d_triangleGroups),
+                   cudaMemcpyDefault);
+      }
+      void trace(Ray *rays,
+                 Hit *hits,
+                 int numRays) override;
+
+      TrianglesDP::DevGroup *d_instancedGroups = 0;
+      CuBQLCUDABackend *const be;
+    };
+
+    void InstancesDP::trace(Ray *rays,
+                            Hit *hits,
+                            int numRays) 
+    {
+      CUBQL_CUDA_SYNC_CHECK();
+      int bs = 128;
+      int nb = divRoundUp(numRays,bs);
+#if 0
+      assert(fe->groups.size() == 1);
+      assert(fe->d_transforms == nullptr);
+      TrianglesDPGroup *tg = (TrianglesDPGroup *)fe->groups[0];
+      assert(tg);
+      TrianglesDP *triangles = (TrianglesDP*)tg->impl.get();
+      assert(triangles);
+
+      g_traceFirstGroupOnly<<<nb,bs>>>(triangles->getDevGroup(),
+                                       rays,hits,
+                                       numRays);
+#else
+      g_traceInstances<<<nb,bs>>>(triangles->getDevGroup(),
+                         rays,hits,
+                         numRays);
+#endif
+      CUBQL_CUDA_SYNC_CHECK();
+    }
+    
+  } // :: cubql_cuda
+  
+  CuBQLCUDABackend::CuBQLCUDABackend(Context *const context)
+    : Backend(context)
+  {
+    SetActiveGPU forDuration(context->gpuID);
+    cudaFree(0);
+  }
+
+  std::shared_ptr<InstancesDPImpl>
+  CuBQLCUDABackend::createInstancesDPImpl(dp::InstancesDPGroup *fe)
+  { return std::make_shared<cubql_cuda::InstancesDP>(this,fe); }
+    
+  std::shared_ptr<TrianglesDPImpl>
+  CuBQLCUDABackend::createTrianglesDPImpl(dp::TrianglesDPGroup *fe) 
+  { return std::make_shared<cubql_cuda::TrianglesDP>(this,fe); }
+  
+}
+
+  
diff --git a/dp/cuBQL/InstanceGroup.cu b/dp/cuBQL/InstanceGroup.cu
new file mode 100644
index 0000000..4405f4a
--- /dev/null
+++ b/dp/cuBQL/InstanceGroup.cu
@@ -0,0 +1,223 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "dp/cuBQL/InstanceGroup.h"
+#include "dp/cuBQL/Triangles.h"
+
+namespace dp {
+  namespace cubql_cuda {
+
+    __global__
+    void g_prepareInstances(int numInstances,
+                            InstanceGroup::InstancedObjectDD *instances,
+                            bool hasTransforms,
+                            affine3d *worldToObjectXfms,
+                            affine3d *objectToWorldXfms,
+                            box3d *d_instBounds)
+    {
+      int tid = threadIdx.x+blockIdx.x*blockDim.x;
+      if (tid >= numInstances) return;
+      affine3d xfm;
+      if (!hasTransforms) {
+        xfm = affine3d();
+        objectToWorldXfms[tid] = xfm;
+      } else {
+        xfm = objectToWorldXfms[tid];
+      }
+      worldToObjectXfms[tid] = rcp(xfm);
+      instances[tid].hasXfm = (xfm != affine3d());
+
+      box3d objBounds = instances[tid].group.bvh.nodes[0].bounds;
+      vec3d b0 = objBounds.lower;
+      vec3d b1 = objBounds.upper;
+      box3d instBounds;
+      instBounds.extend(xfmPoint(xfm,vec3d(b0.x,b0.y,b0.z)));
+      instBounds.extend(xfmPoint(xfm,vec3d(b0.x,b0.y,b1.z)));
+      instBounds.extend(xfmPoint(xfm,vec3d(b0.x,b1.y,b0.z)));
+      instBounds.extend(xfmPoint(xfm,vec3d(b0.x,b1.y,b1.z)));
+      instBounds.extend(xfmPoint(xfm,vec3d(b1.x,b0.y,b0.z)));
+      instBounds.extend(xfmPoint(xfm,vec3d(b1.x,b0.y,b1.z)));
+      instBounds.extend(xfmPoint(xfm,vec3d(b1.x,b1.y,b0.z)));
+      instBounds.extend(xfmPoint(xfm,vec3d(b1.x,b1.y,b1.z)));
+      d_instBounds[tid] = instBounds;
+    }
+    
+    InstanceGroup::InstanceGroup(Context *context,
+                                 const std::vector<dp::TrianglesGroup *> &groups,
+                                 const affine3d *transforms)
+      : dp::InstanceGroup(context,groups,
+                          (const DPRAffine *)transforms),
+        numInstances((int)groups.size())
+    {
+      CUBQL_CUDA_SYNC_CHECK();
+      assert(numInstances > 0);
+      std::vector<InstancedObjectDD> instanceDDs;
+      for (auto _group : groups) {
+        InstancedObjectDD instance;
+        TrianglesGroup *group = (TrianglesGroup*)_group;
+        instance.group = group->getDD();
+        instanceDDs.push_back(instance);
+      }
+
+      cudaMalloc((void**)&d_instanceDDs,
+                 numInstances*sizeof(*d_instanceDDs));
+      cudaMemcpy(d_instanceDDs,
+                 instanceDDs.data(),
+                 numInstances*sizeof(*d_instanceDDs),
+                 cudaMemcpyDefault);
+      
+      cudaMalloc((void**)&d_worldToObjectXfms,
+                 numInstances*sizeof(affine3d));
+      cudaMalloc((void**)&d_objectToWorldXfms,
+                 numInstances*sizeof(affine3d));
+      if (transforms)
+        cudaMemcpy(d_objectToWorldXfms,
+                   transforms,
+                   numInstances*sizeof(affine3d),
+                   cudaMemcpyDefault);
+      box3d *d_instBounds = 0;
+      cudaMalloc((void**)&d_instBounds,
+                 numInstances*sizeof(box3d));
+      g_prepareInstances
+        <<<divRoundUp(numInstances,128),128>>>
+        (numInstances,
+         d_instanceDDs,
+         transforms != 0,
+         d_worldToObjectXfms,
+         d_objectToWorldXfms,
+         d_instBounds);
+      CUBQL_CUDA_SYNC_CHECK();
+
+      DeviceMemoryResource memResource;
+      ::cuBQL::BuildConfig buildConfig;
+      buildConfig.maxAllowedLeafSize = 1;
+      std::cout << "==================================================================" << std::endl;
+      PING;
+      std::cout << "TOP" << std::endl;
+      ::cuBQL::cuda::sahBuilder(bvh,
+                                d_instBounds,
+                                numInstances,
+                                buildConfig,
+                                0,
+                                memResource);
+      
+      CUBQL_CUDA_SYNC_CHECK();
+      PING;
+      cudaFree(d_instBounds);
+    }
+
+    InstanceGroup::~InstanceGroup()
+    {
+      cudaFree(d_instanceDDs);
+      cudaFree(d_objectToWorldXfms);
+      cudaFree(d_worldToObjectXfms);
+    }
+    
+    InstanceGroup::DD InstanceGroup::getDD() const
+    {
+      return { d_instanceDDs, d_worldToObjectXfms, bvh };
+    }
+
+    __global__
+    void g_traceRays(/*! the device data for the instancegroup itself */
+                     InstanceGroup::DD world,
+                     DPRRay *rays,
+                     DPRHit *hits,
+                     int numRays,
+                     uint64_t flags)
+    {
+      int tid = threadIdx.x+blockIdx.x*blockDim.x;
+      if (tid >= numRays) return;
+
+#ifdef NDEBUG
+      const bool dbg = false;
+#else
+      bool dbg = false;//(tid == 512*1024+512);
+#endif
+
+      DPRHit hit = hits[tid];
+      hit.primID = -1;
+      hit.instID = -1;
+      hit.t = 1e30;
+      struct ObjectSpaceTravState {
+        int instID = -1;
+        InstanceGroup::InstancedObjectDD instance;
+        ::cuBQL::ray3d ray;
+      } objectSpace;
+      ::cuBQL::ray3d worldRay((const vec3d&)rays[tid].origin,
+                              (const vec3d&)rays[tid].direction,
+                              rays[tid].tMin,
+                              rays[tid].tMax);
+      
+      auto intersectPrim
+        = [&hit,&worldRay,&objectSpace,flags,dbg](uint32_t primID)
+        -> double
+      {
+        RayTriangleIntersection isec;
+        auto &group = objectSpace.instance.group;
+        PrimRef prim = group.primRefs[primID];
+        const TriangleDP tri = group.getTriangle(prim);
+
+        auto getNormal = [tri]() { return cross(tri.b-tri.a,tri.c-tri.a); };
+        bool culled = false;
+        if (flags & DPR_CULL_FRONT)
+          culled |= (dot(getNormal(),objectSpace.ray.direction) <= 0.);
+        if (flags & DPR_CULL_BACK)
+          culled |= (dot(getNormal(),objectSpace.ray.direction) >= 0.);
+        if (!culled && isec.compute(objectSpace.ray,tri)) {
+          hit.primID = prim.primID;
+          hit.instID = objectSpace.instID;
+          hit.geomUserData = group.meshes[prim.geomID].userData;
+          hit.t = isec.t;
+          worldRay.tMax = isec.t;
+        }
+        return worldRay.tMax;
+      };
+      auto enterBlas = [world,worldRay,&objectSpace,dbg]
+        (cuBQL::ray3d &out_ray,
+         cuBQL::bvh3d &out_bvh,
+         int instID) 
+      {
+        objectSpace.instance = world.instancedGroups[instID];
+        objectSpace.instID = instID;
+        objectSpace.ray = worldRay;
+        if (objectSpace.instance.hasXfm) {
+          affine3d worldToObjectXfm = world.worldToObjectXfms[instID];
+          objectSpace.ray.origin
+            = xfmPoint(worldToObjectXfm,worldRay.origin);
+          objectSpace.ray.direction
+            = xfmVector(worldToObjectXfm,worldRay.direction);
+        }
+        out_ray = objectSpace.ray;
+        if (dbg) dout << "out ray " << out_ray << "\n";
+        out_bvh = objectSpace.instance.group.bvh;
+        // out_bvh.nodes = objectSpace.instance.group.bvh.nodes;
+      };
+      auto leaveBlas = []() -> void {
+        /* nothing to do */
+      };
+      
+      ::cuBQL::shrinkingRayQuery::twoLevel::forEachPrim
+          (enterBlas,leaveBlas,intersectPrim,world.bvh,worldRay,dbg);
+      
+      hits[tid] = hit;
+    }
+
+    
+    void InstanceGroup::traceRays(DPRRay *d_rays,
+                                  DPRHit *d_hits,
+                                  int numRays,
+                                  uint64_t flags)
+    {
+      int bs = 128;
+      int nb = divRoundUp(numRays,bs);
+      g_traceRays<<<nb,bs>>>(getDD(),
+                             d_rays,d_hits,numRays,
+                             flags);
+      cudaDeviceSynchronize();
+    }
+      
+  }
+}
+
diff --git a/dp/cuBQL/InstanceGroup.h b/dp/cuBQL/InstanceGroup.h
new file mode 100644
index 0000000..e4c1225
--- /dev/null
+++ b/dp/cuBQL/InstanceGroup.h
@@ -0,0 +1,48 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "dp/cuBQL/CuBQLBackend.h"
+#include "dp/cuBQL/Triangles.h"
+#include "dp/World.h"
+
+namespace dp {
+  namespace cubql_cuda {
+
+    /*! a single triangle mesh; can be created over pointes that are
+        either on host or device, but which definitively stores
+        vertices on the device */
+    struct InstanceGroup : public dp::InstanceGroup {
+      struct InstancedObjectDD {
+        TrianglesGroup::DD group;
+        bool hasXfm;
+      };
+      struct DD {
+        const InstancedObjectDD *instancedGroups;
+        const affine3d          *worldToObjectXfms;
+        bvh3d bvh;
+      };
+
+      InstanceGroup(Context *context,
+                    const std::vector<dp::TrianglesGroup *> &groups,
+                    const affine3d *transforms);
+      ~InstanceGroup() override;
+      
+      DD getDD() const;
+
+      void traceRays(DPRRay *d_rays,
+                     DPRHit *d_hits,
+                     int numRays,
+                     uint64_t flags) override;
+
+      int                numInstances = 0;
+      InstancedObjectDD *d_instanceDDs = 0;
+      affine3d          *d_worldToObjectXfms = 0;
+      affine3d          *d_objectToWorldXfms = 0;
+      bvh3d bvh;
+    };
+    
+  }
+}
diff --git a/dp/cuBQL/Triangles.cu b/dp/cuBQL/Triangles.cu
new file mode 100644
index 0000000..998a850
--- /dev/null
+++ b/dp/cuBQL/Triangles.cu
@@ -0,0 +1,119 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "dp/cuBQL/Triangles.h"
+
+namespace dp {
+  namespace cubql_cuda {
+
+    /*! triangle mesh representation on the device */
+    __global__
+    void generateTriangleInputs(int      meshID,
+                                PrimRef *primRefs,
+                                box3d   *primBounds,
+                                int      numTrisThisMesh,
+                                TriangleMesh::DD mesh)
+    {
+      int tid = threadIdx.x+blockIdx.x*blockDim.x;
+      if (tid >= numTrisThisMesh) return;
+
+      vec3i idx = mesh.indices[tid];
+      box3d bb;
+      bb.extend(mesh.vertices[idx.x]);
+      bb.extend(mesh.vertices[idx.y]);
+      bb.extend(mesh.vertices[idx.z]);
+
+      primRefs[tid] = { meshID, tid };
+      primBounds[tid] = bb;
+    }
+    
+    TriangleMesh::TriangleMesh(Context         *context,
+                               uint64_t         userData,
+                               const vec3d     *vertexArray,
+                               int              vertexCount,
+                               const vec3i     *indexArray,
+                               int              indexCount)
+      : dp::TriangleMesh(context,userData,
+                         vertexArray,vertexCount,
+                         indexArray,indexCount),
+        vertices(vertexArray,vertexCount),
+        indices(indexArray,indexCount)
+    {}
+    
+    TrianglesGroup::TrianglesGroup(Context *context,
+                                   const std::vector<dp::TriangleMesh *> &meshes)
+      : dp::TrianglesGroup(context,meshes)
+    {
+      CUBQL_CUDA_SYNC_CHECK();
+      SetActiveGPU forDuration(context->gpuID);
+      
+      int numTrisTotal = 0;
+      std::vector<TriangleMesh::DD> devMeshes;
+      for (auto _geom : meshes) {
+        TriangleMesh *geom = (TriangleMesh*)_geom;
+        devMeshes.push_back(geom->getDD());
+        numTrisTotal += geom->indices.count;
+      }
+      cudaMalloc((void **)&d_meshDDs,
+                 devMeshes.size()*sizeof(TriangleMesh::DD));
+      cudaMemcpy((void*)d_meshDDs,devMeshes.data(),
+                 devMeshes.size()*sizeof(TriangleMesh::DD),cudaMemcpyDefault);
+      
+      cudaMalloc((void **)&d_primRefs,numTrisTotal*sizeof(*d_primRefs));
+
+      box3d   *d_primBounds = nullptr;
+      cudaMalloc((void **)&d_primBounds,numTrisTotal*sizeof(*d_primBounds));
+      
+      int offset = 0;
+      for (int meshID=0;meshID<(int)meshes.size();meshID++) {
+        TriangleMesh *mesh = (TriangleMesh *)meshes[meshID];
+        int count = mesh->indices.count;
+        int bs = 128;
+        int nb = divRoundUp(count,bs);
+        generateTriangleInputs<<<nb,bs>>>(meshID,
+                                          d_primRefs+offset,
+                                          d_primBounds+offset,
+                                          count,
+                                          mesh->getDD());
+        offset += count;
+      }
+      cudaStreamSynchronize(0);
+      
+      // std::cout << "#dpr: building BVH over " << prettyNumber(numTrisTotal)
+      //           << " triangles" << std::endl;
+      CUBQL_CUDA_SYNC_CHECK();
+      DeviceMemoryResource memResource;
+#if 0
+      ::cuBQL::gpuBuilder(bvh,
+                          d_primBounds,
+                          numTrisTotal,
+                          ::cuBQL::BuildConfig(),
+                          0,
+                          memResource);
+#else
+      ::cuBQL::cuda::sahBuilder(bvh,
+                                d_primBounds,
+                                numTrisTotal,
+                                ::cuBQL::BuildConfig(),
+                                0,
+                                memResource);
+#endif
+      // std::cout << "#dpr: ... bvh built." << std::endl;
+      
+      cudaFree(d_primBounds);
+      CUBQL_CUDA_SYNC_CHECK();
+    }
+  
+    TrianglesGroup::~TrianglesGroup()
+    {
+      CUBQL_CUDA_SYNC_CHECK();
+      cudaFree(d_meshDDs);
+      cudaFree(d_primRefs);
+      ::cuBQL::cuda::free(bvh);
+      CUBQL_CUDA_SYNC_CHECK();
+    }
+
+  }
+}
+
+    
diff --git a/dp/cuBQL/Triangles.h b/dp/cuBQL/Triangles.h
new file mode 100644
index 0000000..cf56357
--- /dev/null
+++ b/dp/cuBQL/Triangles.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include "dp/cuBQL/CuBQLBackend.h"
+#include "dp/Triangles.h"
+
+namespace dp {
+  namespace cubql_cuda {
+
+    /*! a single triangle mesh; can be created over pointes that are
+        either on host or device, but which definitively stores
+        vertices on the device */
+    struct TriangleMesh : public dp::TriangleMesh {
+      struct DD {
+        inline __cubql_both TriangleDP getTriangle(uint32_t primID) const;
+        
+        const vec3d   *vertices;
+        const vec3i   *indices;
+        uint64_t userData;
+      };
+
+      TriangleMesh(Context         *context,
+                   uint64_t         userData,
+                   const vec3d     *vertexArray,
+                   int              vertexCount,
+                   const vec3i     *indexArray,
+                   int              indexCount);
+      
+      DD getDD() const
+      { return { vertices.elements, indices.elements, userData }; }
+    
+      AutoUploadArray<vec3d> vertices;
+      AutoUploadArray<vec3i> indices;
+    };
+
+    
+    /*! a group/acceleration structure over one or more triangle meshes */
+    struct TrianglesGroup : public dp::TrianglesGroup {
+      /*! device data for a cubql group over one or more triangle
+          meshes */
+      struct DD {
+        /*! return the triangle specified by the given primref */
+        inline __cubql_both DD() = default;
+        inline __cubql_both TriangleDP getTriangle(PrimRef prim) const;
+        
+        TriangleMesh::DD *meshes;
+        PrimRef          *primRefs;
+        bvh3d             bvh;
+      };
+      
+      TrianglesGroup(Context *context,
+                     const std::vector<dp::TriangleMesh *> &geoms);
+      ~TrianglesGroup() override;
+
+
+      DD getDD() const
+      {
+        DD dd;
+        dd.meshes = d_meshDDs;
+        dd.primRefs = d_primRefs;
+        dd.bvh = bvh;
+        return dd;
+      }
+      
+      bvh3d             bvh;
+      TriangleMesh::DD *d_meshDDs;
+      PrimRef          *d_primRefs;
+    };
+
+    inline __cubql_both
+    TriangleDP TriangleMesh::DD::getTriangle(uint32_t primID) const
+    {
+      vec3i idx = indices[primID];
+      TriangleDP tri;
+      tri.a = vertices[idx.x];
+      tri.b = vertices[idx.y];
+      tri.c = vertices[idx.z];
+      return tri;
+    }
+
+    inline __cubql_both
+    TriangleDP TrianglesGroup::DD::getTriangle(PrimRef prim) const
+    {
+      const TriangleMesh::DD &mesh = meshes[prim.geomID];
+      TriangleDP tri = mesh.getTriangle(prim.primID);
+      return tri;
+    }
+    
+  }
+}
diff --git a/dp/deepeeRT.cpp b/dp/deepeeRT.cpp
index 1cd8454..7624ba8 100644
--- a/dp/deepeeRT.cpp
+++ b/dp/deepeeRT.cpp
@@ -7,15 +7,22 @@
 #include "dp/Triangles.h"
 #include "dp/Group.h"
 #include "dp/World.h"
+#include "dp/cuBQL/CuBQLBackend.h"
 
 namespace dp {
+
+  Context *Context::create(int gpuID)
+  {
+    return new cubql_cuda::CuBQLCUDABackend(gpuID);
+  };
+  
 } // ::dp
 
 DPR_API
 DPRContext dprContextCreate(DPRContextType contextType,
                             int gpuToUse)
 {
-  return (DPRContext)new dp::Context(gpuToUse);
+  return (DPRContext)dp::Context::create(gpuToUse);
 }
 
 DPR_API
@@ -27,20 +34,20 @@ DPRTriangles dprCreateTrianglesDP(DPRContext _context,
                                     yielded the intersection.  */
                                   uint64_t userData,
                                   /*! device array of vertices */
-                                  DPRvec3 *d_vertexArray,
+                                  DPRvec3 *vertexArray,
                                   size_t   vertexCount,
                                   /*! device array of int3 vertex indices */
-                                  DPRint3 *d_indexArray,
+                                  DPRint3 *indexArray,
                                   size_t   indexCount)
 {
   dp::Context *context = (dp::Context *)_context;
   assert(context);
-  return (DPRTriangles)new dp::TrianglesDP(context,
-                                           userData,
-                                           (const dp::vec3d*)d_vertexArray,
-                                           vertexCount,
-                                           (const dp::vec3i*)d_indexArray,
-                                           indexCount);
+  return (DPRTriangles)context->
+    createTriangleMesh(userData,
+                       (const dp::vec3d*)vertexArray,
+                       vertexCount,
+                       (const dp::vec3i*)indexArray,
+                       indexCount);
 }
 
 DPR_API
@@ -50,36 +57,33 @@ DPRGroup dprCreateTrianglesGroup(DPRContext   _context,
 {
   dp::Context *context = (dp::Context *)_context;
   assert(context);
-  std::vector<dp::TrianglesDP*> geoms;
+  std::vector<dp::TriangleMesh*> geoms;
   for (int i=0;i<(int)triangleGeomsCount;i++) {
-    dp::TrianglesDP *geom = (dp::TrianglesDP *)triangleGeomsArray[i];
+    dp::TriangleMesh *geom = (dp::TriangleMesh *)triangleGeomsArray[i];
     assert(geom);
     assert(geom->context == context);
     geoms.push_back(geom);
   }
-  return (DPRGroup)new dp::TrianglesDPGroup(context,geoms);
+  return (DPRGroup)context->createTrianglesGroup(geoms);
 }
 
-
 DPR_API
 DPRWorld dprCreateWorldDP(DPRContext _context,
                           DPRGroup   *instanceGroups,
-                          DPRAffine  *d_instanceTransforms,
+                          DPRAffine  *instanceTransforms,
                           size_t      instanceCount)
 {
   dp::Context *context = (dp::Context *)_context;
   assert(context);
   
-  assert(instanceCount == 1 && "instancing not yet implemented");
-  assert(d_instanceTransforms == nullptr && "instancing not yet implemented");
-  
-  std::vector<dp::Group *> groups;
+  std::vector<dp::TrianglesGroup *> groups;
   for (int i=0;i<(int)instanceCount;i++) {
-    dp::Group *group = (dp::Group *)instanceGroups[i];
+    dp::TrianglesGroup *group = (dp::TrianglesGroup *)instanceGroups[i];
     assert(group);
     groups.push_back(group);
   }
-  return (DPRWorld)new dp::InstancesDPGroup(context,groups,d_instanceTransforms);
+  return (DPRWorld)context->
+    createInstanceGroup(groups,instanceTransforms);
 }
 
 DPR_API
@@ -91,15 +95,41 @@ void dprTrace(/*! the world we want the rays to be traced against */
               DPRHit *d_hits,
               /*! number of rays that need tracing. d_rays and
                 d_hits *must* have (at least) that many entires */
-              int numRays)
+              int numRays,
+              uint64_t flags)
 {
-  dp::InstancesDPGroup *world = (dp::InstancesDPGroup *)_world;
+  dp::InstanceGroup *world = (dp::InstanceGroup *)_world;
   assert(world);
   assert(d_hits);
   assert(d_rays);
   assert(numRays > 0);
-  world->traceRays(d_rays,d_hits,numRays);
+  world->traceRays(d_rays,d_hits,numRays,flags);
+}
+
+DPR_API void dprFreeWorld(DPRWorld world)
+{
+  assert(world);
+  delete (dp::InstanceGroup *)world;
+}
+    
+DPR_API void dprFreeTriangles(DPRTriangles triangles)
+{
+  assert(triangles);
+  delete (dp::TriangleMesh *)triangles;
+}
+    
+DPR_API void dprFreeGroup(DPRGroup group)
+{
+  assert(group);
+  delete (dp::TrianglesGroup *)group;
+}
+    
+DPR_API void dprFreeContext(DPRContext context)
+{
+  assert(context);
+  delete (dp::Context *)context;
 }
+    
 
 
 
diff --git a/dp/deepeeRTConfig.cmake.in b/dp/deepeeRTConfig.cmake.in
new file mode 100644
index 0000000..ac740ad
--- /dev/null
+++ b/dp/deepeeRTConfig.cmake.in
@@ -0,0 +1,42 @@
+## SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+## SPDX-License-Identifier: Apache-2.0
+
+# originally copied-then-adapted-from the following:
+
+## Copyright 2021-2024 The Khronos Group
+## SPDX-License-Identifier: 2-Apache.0
+
+@PACKAGE_INIT@
+
+include(CMakeFindDependencyMacro)
+
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_dependency(Threads)
+
+include(${CMAKE_CURRENT_LIST_DIR}/deepeeRT-config.cmake)
+
+if (NOT TARGET deepeeRT::deepeeRT)
+  message(FATAL_ERROR "CMAKE_PREFIX_PATH or deepeeRT_DIR are pointing to a \
+  deepeeRT build directory. Please do a full install of deepeeRT \
+  (e.g. 'make install') and point to where you installed it \
+  (CMAKE_INSTALL_PREFIX in your build of deepeeRT). \
+  Consuming deepeeRT from a build directory is not supported.")
+endif()
+set(deepeeRT_FOUND ON)
+
+set(DEEPEERT_DATAROOTDIR
+  ${CMAKE_CURRENT_LIST_DIR}/../../../@CMAKE_INSTALL_DATAROOTDIR@/deepeeRT
+  )
+
+foreach(component ${deepeeRT_FIND_COMPONENTS})
+  # For requested component, execute its "config" script
+  message("cfind component ${component}")
+  if (EXISTS "${CMAKE_CURRENT_LIST_DIR}/${component}-config.cmake")
+    include(${CMAKE_CURRENT_LIST_DIR}/${component}-config.cmake)
+    set(${component}_FOUND ON)
+  else()
+    set(${component}_FOUND OFF)
+  endif()
+endforeach()
+
+check_required_components(@PROJECT_NAME@)
diff --git a/include/deepee/deepee.h b/include/deepeeRT/deepeeRT.in.h
similarity index 71%
rename from include/deepee/deepee.h
rename to include/deepeeRT/deepeeRT.in.h
index 1d62127..3c290eb 100644
--- a/include/deepee/deepee.h
+++ b/include/deepeeRT/deepeeRT.in.h
@@ -44,6 +44,11 @@
 #include <sys/types.h>
 #include <stdint.h>
 
+#define DEEPEERT_VERSION_MAJOR @DEEPEERT_VERSION_MAJOR@
+#define DEEPEERT_VERSION_MINOR @DEEPEERT_VERSION_MINOR@
+#define DEEPEERT_VERSION_PATCH @DEEPEERT_VERSION_PATCH@
+
+
 #ifdef _WIN32
 // for now, we only support a static build for this library
 # if 1 || defined(deepeeRT_STATIC)
@@ -77,6 +82,14 @@ typedef struct _DPRContext       *DPRContext;
 
 typedef enum { DPR_CONTEXT_GPU } DPRContextType;
 
+#define DPR_FLAGS_NONE (uint64_t(0))
+/*! if enabled, we will skip all intersections with triangles whose
+    normal faces TOWARDS the origin (ie, cull iff dot(ray.dir,N)<0) */
+#define DPR_CULL_FRONT (uint64_t(1ull<<0))
+/*! if enabled, we will skip all intersections with triangles whose
+    normal faces AWAY the origin (ie, cull iff dot(ray.dir,N)>0) */
+#define DPR_CULL_BACK  (uint64_t(1ull<<1))
+
 struct DPRint3 { int32_t x,y,z; };
 struct DPRvec3 { double  x,y,z; };
 struct DPRvec4 { double  x,y,z,w; };
@@ -113,7 +126,10 @@ struct DPRHit {
     instID this is *not* a linear ID, but whatever int64 value the
     user specified there. */
   uint64_t geomUserData;
-  double  tHit;
+  union {
+    double  tHit; // << old (pre v0.2) spelling
+    double t;
+  };
   double  u, v;
 };
 
@@ -142,17 +158,29 @@ DPRTriangles dprCreateTrianglesDP(DPRContext context,
                                   DPRint3 *indexArray,
                                   size_t   indexCount);
 
+/*! create an object representing a group of one or more triangle
+    meshes that can then get instantiated (dpr never directly
+    instantiates individual triangle meshes, but always groups of
+    meshes. If you need to instantiate a single mesh you need to first
+    create a TrianglesGroup with that single mesh, then instantiate
+    this). */
 DPR_API
 DPRGroup dprCreateTrianglesGroup(DPRContext,
                                  DPRTriangles *triangleGeomsArray,
                                  size_t        triangleGeomsCount);
 
+/*! creates a world over one or more triangle mesh groups; each
+    instance is defined by a handle to the group it wants to
+    instantaite, plus an associated transform that represents the
+    object-to-world transform supposed to be applied to this
+    geometry. */
 DPR_API
 DPRWorld dprCreateWorldDP(DPRContext,
                           DPRGroup  *instanceGroups,
                           DPRAffine *instanceTransforms,
                           size_t     instanceCount);
 
+/*! traces a set of rays against a previously computed world. */
 DPR_API
 void dprTrace(/*! the world we want the rays to be traced against */
               DPRWorld world,
@@ -162,11 +190,33 @@ void dprTrace(/*! the world we want the rays to be traced against */
               DPRHit *d_hits,
               /*! number of rays that need tracing. d_rays and
                 d_hits *must* have (at least) that many entires */
-              int numRays);
+              int numRays,
+              uint64_t flags = 0ull);
 
+/*! frees a previously created world. This should also free all the
+    memory that this world object has created for internal
+    acceleration structures, but will NOT free the groups that it was
+    created over. It is user's job to free those appropriately */
 DPR_API void dprFreeWorld(DPRWorld world);
+
+/*! frees a previously created triangle mesh. This should also free
+    all the memory that this group has created for internal storage of
+    the triangles it was create over. Once freed world objects created
+    over this triangle mesh (or over objects created over this
+    triangle mesh) are no longer valid and may no longer get traced
+    against. */
 DPR_API void dprFreeTriangles(DPRTriangles triangles);
+
+/*! frees a previously created triangle mesh group. This should also
+    free all the memory that this group has created for internal
+    acceleration structures, but will NOT free the triangle meshes
+    that it was created over. It is user's job to free those
+    appropriately. Once freed world objects created over this group
+    are no longer valid and may no longer get traced against. */
 DPR_API void dprFreeGroup(DPRGroup group);
+
+/*! frees the root context. This is currently NOT guaranteed to free
+  all the objects created within this context. */
 DPR_API void dprFreeContext(DPRContext context);
 
 
diff --git a/miniapp/CMakeLists.txt b/miniapp/CMakeLists.txt
index a0aec97..995bc27 100644
--- a/miniapp/CMakeLists.txt
+++ b/miniapp/CMakeLists.txt
@@ -11,3 +11,13 @@ add_executable(dpMiniApp
 )
 target_link_libraries(dpMiniApp PUBLIC deepeeRT)
 
+add_executable(dpInstanceMiniApp
+  DGEF.h
+  DGEF.cpp
+  import_OBJ.cpp
+  Camera.h
+  Camera.cu
+  instanceMiniApp.cu
+)
+target_link_libraries(dpInstanceMiniApp PUBLIC deepeeRT)
+
diff --git a/miniapp/Camera.cu b/miniapp/Camera.cu
index 5e72d80..d67f74f 100644
--- a/miniapp/Camera.cu
+++ b/miniapp/Camera.cu
@@ -23,7 +23,7 @@ namespace miniapp {
 
     dv *= scale;
     du *= scale*aspect;
-    
+
 #if 0
     // for testing: this is a ortho camera with parallel rays and
     // different origins all n a plane
diff --git a/miniapp/Camera.h b/miniapp/Camera.h
index 6183aa6..85cf3c0 100644
--- a/miniapp/Camera.h
+++ b/miniapp/Camera.h
@@ -24,7 +24,8 @@ namespace miniapp {
     /*! generate ray for a given pixel/image plane coordinate. based on
       how the camera was created this could be either a orthogonal or
       a perspective camera (see Camera.cpp) */
-    inline __device__ Ray generateRay(vec2d pixel, bool dbg=false) const;
+    inline __device__
+    Ray generateRay(vec2d pixel, bool dbg=false) const;
 
     struct {
       vec3d v,du,dv;
@@ -45,6 +46,12 @@ namespace miniapp {
     Ray ray;
     ray.origin
       = origin.v+pixel.x*origin.du+pixel.y*origin.dv;
+    if (dbg)
+      printf("camera org %f %f %f\n",
+             (float)origin.v.x,
+             (float)origin.v.y,
+             (float)origin.v.z);
+
     ray.direction
       = normalize(direction.v+pixel.x*direction.du+pixel.y*direction.dv);
     ray.tMin = 0.;
diff --git a/miniapp/DGEF.cpp b/miniapp/DGEF.cpp
new file mode 100644
index 0000000..249bb55
--- /dev/null
+++ b/miniapp/DGEF.cpp
@@ -0,0 +1,93 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "DGEF.h"
+#include <fstream>
+
+namespace miniapp {
+  namespace dgef {
+    
+    Scene *Scene::load(const std::string &fileName)
+    {
+      std::ifstream in(fileName.c_str(),std::ios::binary);
+
+      std::vector<Mesh *> meshes;
+      Scene *scene = new Scene;
+      
+      size_t header;
+      in.read((char*)&header,sizeof(header));
+      
+      bool isProperDGEF = (header == 0xdefdefdefULL);
+      size_t numMeshes;
+      in.read((char*)&numMeshes,sizeof(numMeshes));
+      for (int meshID=0;meshID<numMeshes;meshID++) {
+        Mesh *_mesh = new Mesh;
+        meshes.push_back(_mesh);
+        Mesh &mesh = *_mesh;
+      
+        size_t count;
+        in.read((char*)&count,sizeof(count));
+        mesh.vertices.resize(count);
+        in.read((char*)mesh.vertices.data(),
+                count*sizeof(vec3d));
+      
+        in.read((char*)&count,sizeof(count));
+        for (size_t i=0;i<count;i++) {
+          vec3ul idx;
+          in.read((char*)&idx,sizeof(idx));
+          mesh.indices.push_back({(int)idx.x,(int)idx.y,(int)idx.z});
+        }
+      }
+
+      if (!isProperDGEF) {
+        Object *object = new Object;
+        object->meshes = meshes;
+        Instance *inst = new Instance;
+        inst->xfm = affine3d();
+        inst->object = object;
+        scene->instances.push_back(inst);
+        return scene;
+      }
+      
+      size_t numObjects;
+      in.read((char*)&numObjects,sizeof(numObjects));
+      int meshBegin = 0;
+      std::vector<Object *> objects;
+      for (int i=0;i<numObjects;i++) {
+        int count;
+        in.read((char*)&count,sizeof(count));
+        Object *object = new Object;
+        for (int i=0;i<count;i++)
+          object->meshes.push_back(meshes[meshBegin++]);
+        objects.push_back(object);
+      }
+      
+      size_t numInstances;
+      in.read((char*)&numInstances,sizeof(numInstances));
+      for (int instID=0;instID<numInstances;instID++) {
+        Instance *inst = new Instance;
+        in.read((char*)&inst->xfm,sizeof(inst->xfm));
+        int objectID;
+        in.read((char*)&objectID,sizeof(objectID));
+        inst->object = objects[objectID];
+        scene->instances.push_back(inst);
+      }
+      return scene;
+    }
+
+    box3d Scene::bounds() const
+    {
+      std::cout << "#dpm: computing scene bounds" << std::endl;
+      box3d bounds;
+      for (auto inst : instances)
+        for (auto m : inst->object->meshes)
+          for (auto v : m->vertices)
+            bounds.extend(xfmPoint(inst->xfm,v));
+      std::cout << "#dpm: bounds = " << bounds << std::endl;
+      return bounds;
+    }
+
+  }
+}
+
diff --git a/miniapp/DGEF.h b/miniapp/DGEF.h
new file mode 100644
index 0000000..3e741d1
--- /dev/null
+++ b/miniapp/DGEF.h
@@ -0,0 +1,39 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "deepeeRT/deepeeRT.h"
+#include "cuBQL/math/box.h"
+#include "cuBQL/math/linear.h"
+#include "cuBQL/math/affine.h"
+#include <cuda_runtime.h>
+
+#include <vector>
+#include <map>
+#include <memory>
+#include <string>
+#include <iostream>
+
+namespace miniapp {
+  using namespace cuBQL;
+  namespace dgef {
+    struct Mesh {
+      std::vector<vec3d> vertices;
+      std::vector<vec3i> indices;
+    };
+    struct Object {
+      std::vector<Mesh *> meshes;
+    };
+    struct Instance {
+      affine3d xfm;
+      Object *object;
+    };
+    struct Scene {
+      static Scene *load(const std::string &fileName);
+      std::vector<Instance *> instances;
+      box3d bounds() const;
+    };
+  }
+}
diff --git a/miniapp/Mesh.cpp b/miniapp/Mesh.cpp
index d7ac0b4..1161e8f 100644
--- a/miniapp/Mesh.cpp
+++ b/miniapp/Mesh.cpp
@@ -4,9 +4,11 @@
 #include "Mesh.h"
 #include <fstream>
 #include <random>
+#include <cuBQL/math/affine.h>
 
 namespace miniapp {
-
+  using cuBQL::affine3d;
+  
   box3d Mesh::bounds()
   {
     box3d bb;
@@ -55,6 +57,8 @@ namespace miniapp {
       load_obj(fileName);
     else if (ext == ".binmesh")
       load_binmesh(fileName);
+    else if (ext == ".dgef")
+      load_dgef(fileName);
     else
       throw std::runtime_error("un-recognized or un-supported file extension '"+ext+"'");
   }
@@ -80,6 +84,52 @@ namespace miniapp {
     in.read((char*)indices.data(),numIndices*sizeof(indices[0]));
   }
   
+
+  void Mesh::load_dgef(const std::string &fileName)
+  {
+    vertices.clear();
+    indices.clear();
+    
+    std::ifstream in(fileName.c_str(),std::ios::binary);
+
+    size_t header;
+    in.read((char*)&header,sizeof(header));
+
+    size_t numMeshes;
+    in.read((char*)&numMeshes,sizeof(numMeshes));
+    std::vector<Mesh> meshes(numMeshes);
+    for (int meshID=0;meshID<numMeshes;meshID++) {
+      Mesh &mesh = meshes[meshID];
+      
+      size_t count;
+      in.read((char*)&count,sizeof(count));
+      mesh.vertices.resize(count);
+
+      in.read((char*)mesh.vertices.data(),
+              count*sizeof(vec3d));
+      
+      in.read((char*)&count,sizeof(count));
+      for (size_t i=0;i<count;i++) {
+        vec3ul idx;
+        in.read((char*)&idx,sizeof(idx));
+        mesh.indices.push_back({(int)idx.x,(int)idx.y,(int)idx.z});
+      }
+    }
+    
+    size_t numInstances;
+    in.read((char*)&numInstances,sizeof(numInstances));
+    for (int instID=0;instID<numInstances;instID++) {
+      affine3d xfm;
+      in.read((char*)&xfm,sizeof(xfm)); 
+      size_t meshID;
+      in.read((char*)&meshID,sizeof(meshID)); 
+    }
+
+    this->vertices = meshes[0].vertices;
+    this->indices = meshes[0].indices;
+  }
+  
+
   void Mesh::translate(vec3d delta)
   {
     for (auto &v : vertices)
diff --git a/miniapp/Mesh.h b/miniapp/Mesh.h
index 1010c17..33cb92d 100644
--- a/miniapp/Mesh.h
+++ b/miniapp/Mesh.h
@@ -1,9 +1,10 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
 
-#include "deepee/deepee.h"
+#include "deepeeRT/deepeeRT.h"
 #include "cuBQL/math/box.h"
 #include "cuBQL/math/linear.h"
 #include <cuda_runtime.h>
@@ -29,6 +30,7 @@ namespace miniapp {
   private:
     void load_binmesh(const std::string &fileName);
     void load_obj(const std::string &fileName);
+    void load_dgef(const std::string &fileName);
   };
   
   /*! helper function that creates a mesh with a terrain-like shape,
diff --git a/miniapp/instanceMiniApp.cu b/miniapp/instanceMiniApp.cu
new file mode 100644
index 0000000..786249d
--- /dev/null
+++ b/miniapp/instanceMiniApp.cu
@@ -0,0 +1,250 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "Camera.h"
+#include "DGEF.h"
+#include <fstream>
+
+namespace miniapp {
+
+  /*! helper function that creates a semi-random color from an ID */
+  inline __cubql_both vec3f randomColor(int i)
+  {
+    const uint64_t FNV_offset_basis = 0xcbf29ce484222325ULL;
+    const uint64_t FNV_prime = 0x10001a7;
+    uint32_t v = (uint32_t)FNV_offset_basis;
+    v = FNV_prime * v ^ i;
+    v = FNV_prime * v ^ i;
+    v = FNV_prime * v ^ i;
+    v = FNV_prime * v ^ i;
+
+    int r = v >> 24;
+    v = FNV_prime * v ^ i;
+    int b = v >> 16;
+    v = FNV_prime * v ^ i;
+    int g = v >> 8;
+    return vec3f((r&255)/255.f,
+                 (g&255)/255.f,
+                 (b&255)/255.f);
+  }
+
+  void getFrame(std::string up,
+                vec3d &dx,
+                vec3d &dy,
+                vec3d &dz)
+  {
+    if (up == "z") {
+      dx = {1.,0.,0.};
+      dy = {0.,1.,0.};
+      dz = {0.,0.,1.};
+      return;
+    }
+    if (up == "y") {
+      dx = {1.,0.,0.};
+      dz = {0.,1.,0.};
+      dy = {0.,0.,1.};
+      return;
+    }
+    throw std::runtime_error("unhandled 'up'-specifier of '"+up+"'");
+  }
+  
+  DPRWorld createWorld(DPRContext context,
+                       dgef::Scene *scene)
+  {
+    std::map<dgef::Object *, DPRGroup> objects;
+    CUBQL_CUDA_SYNC_CHECK();
+
+    for (auto inst : scene->instances)
+      objects[inst->object] = 0;
+
+    std::cout << "#dpm: creating " << objects.size() << " objects" << std::endl;
+    int meshID = 0;
+    for (auto &pairs : objects) {
+      auto obj = pairs.first;
+      std::vector<DPRTriangles> geoms;
+      for (auto pm : obj->meshes) {
+        std::cout << "#dpm: creating dpr triangle mesh w/ "
+                  << prettyNumber(pm->indices.size()) << " triangles"
+                  << std::endl;
+        DPRTriangles geom
+          = dprCreateTrianglesDP(context,
+                                 meshID++,
+                                 (DPRvec3*)pm->vertices.data(),
+                                 pm->vertices.size(),
+                                 (DPRint3*)pm->indices.data(),
+                                 pm->indices.size());
+        CUBQL_CUDA_SYNC_CHECK();
+        geoms.push_back(geom);
+      }
+      CUBQL_CUDA_SYNC_CHECK();
+      
+      DPRGroup group = dprCreateTrianglesGroup(context,
+                                               geoms.data(),
+                                               geoms.size());
+      objects[obj] = group;
+    }
+    CUBQL_CUDA_SYNC_CHECK();
+    
+    std::cout << "#dpm: creating dpr world" << std::endl;
+    std::vector<affine3d> xfms;
+    std::vector<DPRGroup> groups;
+    for (auto inst : scene->instances) {
+      xfms.push_back(inst->xfm);
+      groups.push_back(objects[inst->object]);
+    }
+    DPRWorld world = dprCreateWorldDP(context,
+                                      groups.data(),
+                                      (DPRAffine*)xfms.data(),
+                                      groups.size());
+    CUBQL_CUDA_SYNC_CHECK();
+    return world;
+  }
+
+
+  __global__
+  void g_shadeRays(vec4f *d_pixels,
+                   DPRRay *d_rays,
+                   DPRHit *d_hits,
+                   vec2i fbSize)
+  {
+    int ix = threadIdx.x+blockIdx.x*blockDim.x;
+    int iy = threadIdx.y+blockIdx.y*blockDim.y;
+    
+    if (ix >= fbSize.x) return;
+    if (iy >= fbSize.y) return;
+
+    //Ray ray = (const Ray &)d_rays[ix+iy*fbSize.x];
+    DPRHit hit = d_hits[ix+iy*fbSize.x];
+    vec3f color = randomColor(hit.primID + 0x290374*hit.geomUserData);
+    vec4f pixel = {color.x,color.y,color.z,1.f};
+    int tid = ix+iy*fbSize.x;
+    d_pixels[tid] = pixel;
+  }
+  
+  __global__
+  void g_generateRays(DPRRay *d_rays,
+                      vec2i fbSize,
+                      const Camera camera)
+  {
+    static_assert(sizeof(DPRRay) == sizeof(Ray));
+    
+    int ix = threadIdx.x+blockIdx.x*blockDim.x;
+    int iy = threadIdx.y+blockIdx.y*blockDim.y;
+    
+    if (ix >= fbSize.x) return;
+    if (iy >= fbSize.y) return;
+
+    double u = ix+.5;
+    double v = iy+.5;
+
+    bool dbg = false;//ix == 512 && iy == 512;
+    vec2d pixel = {u,v};
+    Ray ray = camera.generateRay(pixel,dbg);
+
+    int rayID = ix+iy*fbSize.x;
+    if (dbg)
+      printf("ray %f %f %f : %f %f %f\n",
+             (float)ray.origin.x,
+             (float)ray.origin.y,
+             (float)ray.origin.z,
+             (float)ray.direction.x,
+             (float)ray.direction.y,
+             (float)ray.direction.z);
+    ((Ray *)d_rays)[rayID] = ray;
+  }
+  
+  void main(int ac, char **av)
+  {
+    std::string inFileName;
+    std::string outFileName = "deepeeTest.ppm";
+    vec2i fbSize = { 1024,1024 };
+    uint64_t flags = 0;
+    for (int i=1;i<ac;i++) {
+      std::string arg = av[i];
+      if (arg[0] != '-') {
+        inFileName = arg;
+      } else if (arg == "-bc" || arg == "--backface-culling") {
+        flags |= DPR_CULL_BACK;
+      } else if (arg == "-fc" || arg == "--frontface-culling") {
+        flags |= DPR_CULL_FRONT;
+      } else if (arg == "-or" || arg == "--output-res") {
+        fbSize.x = std::stoi(av[++i]);
+        fbSize.y = std::stoi(av[++i]);
+      } else
+        throw std::runtime_error("un-recognized cmdline arg '"+arg+"'");
+    }
+    if (inFileName.empty())
+      throw std::runtime_error("no input file name specified");
+
+    dgef::Scene *scene = dgef::Scene::load(inFileName);
+
+    box3d bounds = scene->bounds();
+    Camera camera = generateCamera(fbSize,
+                                   /* bounds to focus on */
+                                   bounds,
+                                   /* point we're looking from*/
+                                   length(bounds.size())*vec3d(2,1,4),
+                                   /* up for orientation */
+                                   vec3d(0,1,0));
+
+    vec2i bs(16,16);
+    vec2i nb = divRoundUp(fbSize,bs);
+    
+    std::cout << "#dpm: creating dpr context" << std::endl;
+    DPRContext dpr = dprContextCreate(DPR_CONTEXT_GPU,0);
+    std::cout << "#dpm: creating world" << std::endl;
+    DPRWorld world = createWorld(dpr,scene);
+
+    CUBQL_CUDA_SYNC_CHECK();
+    DPRRay *d_rays = 0;
+    cudaMalloc((void **)&d_rays,fbSize.x*fbSize.y*sizeof(DPRRay));
+    CUBQL_CUDA_SYNC_CHECK();
+    g_generateRays<<<nb,bs>>>(d_rays,fbSize,camera);
+    CUBQL_CUDA_SYNC_CHECK();
+      
+    DPRHit *d_hits = 0;
+    cudaMalloc((void **)&d_hits,fbSize.x*fbSize.y*sizeof(DPRHit));
+
+    CUBQL_CUDA_SYNC_CHECK();
+    std::cout << "#dpm: calling trace" << std::endl;
+    dprTrace(world,d_rays,d_hits,fbSize.x*fbSize.y,flags);
+
+    std::cout << "#dpm: shading rays" << std::endl;
+    vec4f *m_pixels = 0;
+    cudaMallocManaged((void **)&m_pixels,fbSize.x*fbSize.y*sizeof(vec4f));
+    g_shadeRays<<<nb,bs>>>(m_pixels,d_rays,d_hits,fbSize);
+    cudaStreamSynchronize(0);
+
+
+    std::cout << "#dpm: writing test image to " << outFileName << std::endl;
+    std::ofstream out(outFileName.c_str());
+
+    char buf[100];
+    sprintf(buf,"P3\n#deepee test image\n%i %i 255\n",fbSize.x,fbSize.y);
+    out << "P3\n";
+    out << "#deepeeRT test image\n";
+    out << fbSize.x << " " << fbSize.y << " 255" << std::endl;
+    for (int iy=0;iy<fbSize.y;iy++) {
+      for (int ix=0;ix<fbSize.x;ix++) {
+        vec4f pixel = m_pixels[ix+(fbSize.y-1-iy)*fbSize.x];
+        auto write = [&](float f) {
+          f = f*256.f;
+          f = std::min(f,255.f);
+          f = std::max(f,0.f);
+          out << int(f) << " ";
+        };
+        write(pixel.x);
+        write(pixel.y);
+        write(pixel.z);
+        out << " ";
+      }
+      out << "\n";
+    }
+  }
+}
+
+int main(int ac, char **av)
+{
+  miniapp::main(ac,av);
+  return 0;
+}
diff --git a/miniapp/miniapp.cu b/miniapp/miniapp.cu
index a159929..64a74f0 100644
--- a/miniapp/miniapp.cu
+++ b/miniapp/miniapp.cu
@@ -51,6 +51,7 @@ namespace miniapp {
   DPRWorld createWorld(DPRContext context,
                        const std::vector<Mesh *> &meshes)
   {
+      CUBQL_CUDA_SYNC_CHECK();
     std::vector<DPRTriangles> geoms;
     int meshID = 0;
     for (auto pm : meshes) {
@@ -66,18 +67,23 @@ namespace miniapp {
                                                pm->vertices.size(),
                                                (DPRint3*)d_indices,
                                                pm->indices.size());
+      CUBQL_CUDA_SYNC_CHECK();
       geoms.push_back(geom);
     }
+    CUBQL_CUDA_SYNC_CHECK();
     std::cout << "#dpm: creating dpr triangles group w/ "
               << geoms.size() << " meshes" << std::endl;
     DPRGroup group = dprCreateTrianglesGroup(context,
                                              geoms.data(),
                                              geoms.size());
+    CUBQL_CUDA_SYNC_CHECK();
+    
     std::cout << "#dpm: creating dpr world" << std::endl;
     DPRWorld world = dprCreateWorldDP(context,
                                       &group,
                                       nullptr,
                                       1);
+      CUBQL_CUDA_SYNC_CHECK();
     return world;
   }
 
@@ -118,16 +124,25 @@ namespace miniapp {
     double u = ix+.5;
     double v = iy+.5;
 
+    bool dbg = false;//ix == 512 && iy == 512;
     vec2d pixel = {u,v};
-    Ray ray = camera.generateRay(pixel,false);
+    Ray ray = camera.generateRay(pixel,dbg);
 
     int rayID = ix+iy*fbSize.x;
+    if (dbg)
+      printf("ray %f %f %f : %f %f %f\n",
+             (float)ray.origin.x,
+             (float)ray.origin.y,
+             (float)ray.origin.z,
+             (float)ray.direction.x,
+             (float)ray.direction.y,
+             (float)ray.direction.z);
     ((Ray *)d_rays)[rayID] = ray;
   }
   
   void main(int ac, char **av)
   {
-    double scale = 1;
+    double scale = 3;
     std::string up = "y";
     std::string inFileName;
     std::string outFileName = "deepeeTest.ppm";
@@ -167,7 +182,7 @@ namespace miniapp {
                                    /* bounds to focus on */
                                    object.bounds(),
                                    /* point we're looking from*/
-                                   -1.*scale*(dx+dy)+.5*scale*dz,
+                                   -2.*scale*(dx+dy)+scale*dz,
                                    /* up for orientation */
                                    dz);
 
diff --git a/submodules/cuBQL b/submodules/cuBQL
index d18c5fa..9208c7e 160000
--- a/submodules/cuBQL
+++ b/submodules/cuBQL
@@ -1 +1 @@
-Subproject commit d18c5fa1a5c98665d13484841ae65774da7751e8
+Subproject commit 9208c7e5055f9c0669eb428e9ad97229ca2b5674