diff --git a/CMakeLists.txt b/CMakeLists.txt index c4ae16c..f6603ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +set(DEEPEERT_VERSION_MAJOR 0) +set(DEEPEERT_VERSION_MINOR 3) +set(DEEPEERT_VERSION_PATCH 2) +set(DEEPEERT_VERSION ${DEEPEERT_VERSION_MAJOR}.${DEEPEERT_VERSION_MINOR}.${DEEPEERT_VERSION_PATCH}) + set(CMAKE_BUILD_TYPE_INIT "Release") set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_CXX_STANDARD 20) @@ -8,7 +13,7 @@ set(CMAKE_CUDA_ARCHITECTURES native) cmake_minimum_required(VERSION 3.16) cmake_policy(SET CMP0048 NEW) -project(DeePeeRT VERSION 0.1.0 LANGUAGES C CXX CUDA) +project(deepeeRT VERSION ${DEEPEERT_VERSION} LANGUAGES C CXX CUDA) if(NOT SET_UP_CONFIGURATIONS_DONE) set(SET_UP_CONFIGURATIONS_DONE 1) @@ -33,14 +38,91 @@ if ((${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})) SET(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) endif() -option(DEEPEE_USE_LOCAL_CUBQL "Use *local* cuBQL" OFF) -mark_as_advanced(DEEPEE_USE_LOCAL_CUBQL) -if (DEEPEE_USE_LOCAL_CUBQL) +option(DEEPEERT_USE_LOCAL_CUBQL "Use *local* cuBQL" OFF) +mark_as_advanced(DEEPEERT_USE_LOCAL_CUBQL) +if (DEEPEERT_USE_LOCAL_CUBQL) add_subdirectory(../cuBQL EXCLUDE_FROM_ALL builddir_cuBQL) else() add_subdirectory(submodules/cuBQL EXCLUDE_FROM_ALL builddir_cuBQL) endif() add_subdirectory(dp) -add_subdirectory(miniapp) + +if (NOT (${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})) + # don't build miniapp if this is a subproject +else() +# add_subdirectory(miniapp) +endif() + + +# ================================================================== +# install target(s) +# ================================================================== + +include(GNUInstallDirs) + +set(DEEPEERT_CMAKE_INSTALL_DESTINATION + ${CMAKE_INSTALL_LIBDIR}/cmake/deepeeRT-${DEEPEERT_VERSION}#${PROJECT_VERSION} +) + +include(CMakePackageConfigHelpers) + +configure_package_config_file( + "${PROJECT_SOURCE_DIR}/dp/deepeeRTConfig.cmake.in" + "${PROJECT_BINARY_DIR}/deepeeRTConfig.cmake" + INSTALL_DESTINATION + ${DEEPEERT_CMAKE_INSTALL_DESTINATION} +) + +message("DEEPEERT_VERSION ${DEEPEERT_VERSION}") +write_basic_package_version_file( + "deepeeRTConfigVersion.cmake" + VERSION ${DEEPEERT_VERSION} + COMPATIBILITY SameMajorVersion +) + +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/deepeeRTConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/deepeeRTConfigVersion.cmake + DESTINATION + ${DEEPEERT_CMAKE_INSTALL_DESTINATION} + ) + + +install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/cmake + DESTINATION + ${DEEPEERT_CMAKE_INSTALL_DESTINATION} + FILES_MATCHING + PATTERN *.cmake + PATTERN FinddeepeeRT.cmake EXCLUDE +) + +install( + TARGETS + deepeeRT +# EXPORT deepeeRT + EXPORT deepeeRT-config + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + NAMELINK_SKIP + # on Windows put the dlls into bin + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + # ... and the import lib into the devel package + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} +) +install(EXPORT deepeeRT-config +#install(EXPORT deepeeRT + DESTINATION ${DEEPEERT_CMAKE_INSTALL_DESTINATION} + NAMESPACE deepeeRT:: + ) + +# configures the public api +configure_file( + ${PROJECT_SOURCE_DIR}/include/deepeeRT/deepeeRT.in.h + include/deepeeRT/deepeeRT.h +) + +install( + DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/deepeeRT + DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR} +) diff --git a/dp/Backend.cpp b/dp/Backend.cpp index 1b8eb7d..cc258c3 100644 --- a/dp/Backend.cpp +++ b/dp/Backend.cpp @@ -5,10 +5,4 @@ #include "dp/Context.h" namespace dp { - - Backend::Backend(Context *const context) - : context(context), - gpuID(context->gpuID) - {} - } // ::dp diff --git a/dp/Backend.h b/dp/Backend.h index c27d0fe..27a8c58 100644 --- a/dp/Backend.h +++ b/dp/Backend.h @@ -10,7 +10,9 @@ namespace dp { struct TrianglesDPGroup; struct InstancesDPGroup; - /*! CAREFUL: this HAS to match the data layout of DPRRay in deepee.h !*/ + /*! Internal representation of a ray in the input ray + queue. CAREFUL: this HAS to match the data layout of DPRRay in + deepee.h !*/ struct Ray { vec3d origin; vec3d direction; @@ -18,14 +20,18 @@ namespace dp { double tMax; }; - /*! CAREFUL: this HAS to match the data layout of DPRHit in deepee.h !*/ + /*! Internal representation of a hit in the hit queue to be traced + against. CAREFUL: this HAS to match the data layout of DPRHit in + deepee.h !*/ struct Hit { /*! index of prim within the geometry it was created in. A value of '-1' means 'no hit' */ int primID; + /* index of the instance that contained the hit point. Undefined if on hit occurred */ int instID; + /*! user-supplied geom ID (the one specified during geometry create call) for the geometry that contained the hit. Unlike primID and instID this is *not* a linear ID, but whatever int64 value the @@ -35,41 +41,6 @@ namespace dp { double u, v; }; - /*! implements a group of double-precision triangles */ - struct TrianglesDPImpl { - TrianglesDPImpl(TrianglesDPGroup *const fe) : fe(fe) {} - virtual ~TrianglesDPImpl() = default; - TrianglesDPGroup *const fe; - }; - - /*! implements a group of double-precision instances, including the - actual trace() method */ - struct InstancesDPImpl { - InstancesDPImpl(InstancesDPGroup *const fe) : fe(fe) {} - virtual ~InstancesDPImpl() = default; - - virtual void trace(Ray *rays, - Hit *hits, - int numRays) = 0; - - InstancesDPGroup *const fe; - }; - /*! implements an actual backend for a double-precision ray tracing - context. primarily acts as 'factory' for instance and geometry - groups that then do the actual work */ - struct Backend { - Backend(Context *const context); - virtual ~Backend() = default; - - virtual std::shared_ptr - createInstancesDPImpl(dp::InstancesDPGroup *fe) = 0; - - virtual std::shared_ptr - createTrianglesDPImpl(dp::TrianglesDPGroup *fe) = 0; - - Context *const context; - int const gpuID; - }; - + static Context *createBackend(int gpuID); } // ::dp diff --git a/dp/CMakeLists.txt b/dp/CMakeLists.txt index 5708f61..62df336 100644 --- a/dp/CMakeLists.txt +++ b/dp/CMakeLists.txt @@ -3,36 +3,52 @@ find_package(CUDAToolkit REQUIRED) -add_library(deepeeRT_common STATIC - Backend.h - Backend.cpp +add_library(deepeeRT_common INTERFACE#STATIC ) -target_include_directories(deepeeRT_common PUBLIC - ${PROJECT_SOURCE_DIR} - $ +target_include_directories(deepeeRT_common INTERFACE + $ + $/include +# $ ) -target_link_libraries(deepeeRT_common PUBLIC +target_link_libraries(deepeeRT_common INTERFACE#PUBLIC cuBQL CUDA::cuda_driver CUDA::cudart_static ) +target_compile_definitions(deepeeRT_common INTERFACE#PUBLIC + -DdeepeeRT_STATIC=1) add_library(deepeeRT STATIC - ../include/deepee/deepee.h + ../include/deepeeRT/deepeeRT.h + Backend.h + Backend.cpp common.h Triangles.h - Triangles.cu + Triangles.cpp World.h - World.cu + World.cpp + Group.h + Group.cpp Context.h Context.cpp - Group.h - Group.cu deepeeRT.cpp ) +target_include_directories(deepeeRT PUBLIC + $ + ) add_subdirectory(cuBQL) -target_link_libraries(deepeeRT PUBLIC - deepeeRT_common - deepeeRT_cuBQL +target_link_libraries(deepeeRT PRIVATE#PUBLIC + $ + $ +) + +set_target_properties(deepeeRT + PROPERTIES + CXX_VISIBILITY_PRESET hidden + CUDA_VISIBILITY_PRESET hidden + CUDA_SEPARABLE_COMPILATION ON + POSITION_INDEPENDENT_CODE ON + CUDA_USE_STATIC_CUDA_RUNTIME ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON ) diff --git a/dp/Context.cpp b/dp/Context.cpp index 27bc978..b62eaf8 100644 --- a/dp/Context.cpp +++ b/dp/Context.cpp @@ -8,9 +8,7 @@ namespace dp { Context::Context(int gpuID) : gpuID(gpuID) - { - backend = std::make_shared(this); - } - + {} + } // ::dp diff --git a/dp/Context.h b/dp/Context.h index 5605bb5..9a13515 100644 --- a/dp/Context.h +++ b/dp/Context.h @@ -5,13 +5,42 @@ #include "dp/common.h" #include "dp/Backend.h" +#include namespace dp { + + struct InstanceGroup; + struct TriangleMesh; + struct TrianglesGroup; struct Context { + static Context *create(int gpuID); + Context(int gpuID); - std::shared_ptr backend; + /*! creates a 'world' as a grouping of triangle mesh groups, with + associated object-to-world space instance + transforms. Implements the dprTrace() API function */ + virtual dp::InstanceGroup * + createInstanceGroup(const std::vector &groups, + const DPRAffine *transforms) = 0; + + /*! creates an object that represents a single triangle + mesh. implements `dprCreateTrianglesDP()` */ + virtual dp::TriangleMesh * + createTriangleMesh(uint64_t userData, + const vec3d *vertexArray, + int vertexCount, + const vec3i *indexArray, + int indexCount) = 0; + + /*! creates an object that represents a group of multiple triangle + meshes that can then get instantiated. implements + `dprCreateTrianglesGroup()` */ + virtual dp::TrianglesGroup * + createTrianglesGroup(const std::vector &geoms) = 0; + + /*! the cuda gpu ID that this device is going to run on */ int const gpuID; }; diff --git a/dp/Group.cpp b/dp/Group.cpp new file mode 100644 index 0000000..5ded672 --- /dev/null +++ b/dp/Group.cpp @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "dp/Group.h" +#include "dp/Context.h" + +namespace dp { + + Group::Group(Context *const context) + : context(context) + {} + +} // ::dp + diff --git a/dp/Group.cu b/dp/Group.cu deleted file mode 100644 index d1a35a4..0000000 --- a/dp/Group.cu +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "dp/Group.h" -#include "dp/Context.h" - -namespace dp { - - TrianglesDPGroup::TrianglesDPGroup(Context *context, - const std::vector &geoms) - : context(context), - geoms(geoms) - { - impl = context->backend->createTrianglesDPImpl(this); - } - -} // ::dp - diff --git a/dp/Group.h b/dp/Group.h index 78d63fa..47b6808 100644 --- a/dp/Group.h +++ b/dp/Group.h @@ -3,31 +3,22 @@ #pragma once -#include "dp/Triangles.h" +#include "dp/common.h" namespace dp { struct Context; - struct TrianglesDPImpl; - /*! allows for referencing a specific primitive within a specific - geometry within multiple geometries that a group may be built - over */ - struct PrimRef { - int geomID; - int primID; - }; - + /*! abstract base class of a group of one or more intersectable + things that share an acceleration structures. will be subclassed + into InstancesGroup and TrianglesGroup, and then implemented in + each backend based on how this backend works */ struct Group { - }; - - struct TrianglesDPGroup : public Group { - TrianglesDPGroup(Context *context, - const std::vector &geoms); - - std::vector geoms; + Group(Context *const context); + virtual ~Group() = default; + + /*! context that this group was created in */ Context *const context; - std::shared_ptr impl; }; } // ::dp diff --git a/dp/Triangles.cpp b/dp/Triangles.cpp new file mode 100644 index 0000000..a5733ed --- /dev/null +++ b/dp/Triangles.cpp @@ -0,0 +1,32 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "dp/Triangles.h" +#include "dp/Context.h" + +namespace dp { + TriangleMesh::TriangleMesh(Context *context, + uint64_t userData, + const vec3d *_vertexArray, + int vertexCount, + const vec3i *_indexArray, + int indexCount) + : userData(userData), + context(context) + { + /* iw - note this class will NOT store any pointers to host data, + it's the job of the derived class(es) to sture data as, if, and + where required*/ + } + + TrianglesGroup::TrianglesGroup(Context *context, + const std::vector &geoms) + : Group(context) + { + /* iw - note this class will NOT store any pointers to host data, + it's the job of the derived class(es) to sture data as, if, and + where required*/ + } + +} // ::dp + diff --git a/dp/Triangles.cu b/dp/Triangles.cu deleted file mode 100644 index 4781a01..0000000 --- a/dp/Triangles.cu +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "dp/Triangles.h" -#include "dp/Context.h" - -namespace dp { - TrianglesDP::TrianglesDP(Context *context, - uint64_t userData, - const vec3d *_vertexArray, - int vertexCount, - const vec3i *_indexArray, - int indexCount) - : userData(userData), - vertexCount(vertexCount), - indexCount(indexCount), - context(context) - { - cudaMalloc((void**)&vertexArray,vertexCount*sizeof(vec3d)); - cudaMemcpy((void*)vertexArray,_vertexArray, - vertexCount*sizeof(vec3d),cudaMemcpyDefault); - cudaMalloc((void**)&indexArray,indexCount*sizeof(vec3i)); - cudaMemcpy((void*)indexArray,_indexArray, - indexCount*sizeof(vec3i),cudaMemcpyDefault); - } - - TrianglesDP::~TrianglesDP() - { - cudaFree((void*)indexArray); - cudaFree((void*)vertexArray); - } - -} // ::dp - diff --git a/dp/Triangles.h b/dp/Triangles.h index 052148d..b66bbfa 100644 --- a/dp/Triangles.h +++ b/dp/Triangles.h @@ -4,29 +4,50 @@ #pragma once #include "dp/common.h" +#include "dp/Group.h" namespace dp { struct Context; - /*! a mesh of triangles, for a dp context, with vertices in doubles. This calss will make a *copy* of the input arrays */ - struct TrianglesDP { - TrianglesDP(Context *context, - uint64_t userData, - const vec3d *vertexArray, - int vertexCount, - const vec3i *indexArray, - int indexCount); - virtual ~TrianglesDP(); - + /*! (virtual base class for) a mesh of triangles, for a dp context, + with vertices in doubles. implementations of this class may + create copies of the input arays on either host and/or device */ + struct TriangleMesh { + TriangleMesh(Context *context, + uint64_t userData, + const vec3d *vertexArray, + int vertexCount, + const vec3i *indexArray, + int indexCount); + virtual ~TriangleMesh() = default; + + /* iw - note this base class will NOT store any pointers to host + data, it's the job of the derived class(es) to sture data as, + if, and where required*/ uint64_t const userData = 0; - const vec3d *const vertexArray = 0; - const vec3i *const indexArray = 0; - int const vertexCount = 0; - int const indexCount = 0; Context *const context; }; + /*! allows for referencing a specific primitive within a specific + geometry within multiple geometries that a group may be built + over */ + struct PrimRef { + int geomID; + int primID; + }; + + /*! a "group" of one or more triangle meshes, including the + acceleration structure to trace a ray against those triangles */ + struct TrianglesGroup : public Group { + TrianglesGroup(Context *context, + const std::vector &geoms); + + /* iw - note this base class will NOT store any pointers to host + data, it's the job of the derived class(es) to sture data as, + if, and where required*/ + }; + } // ::dp diff --git a/dp/World.cpp b/dp/World.cpp new file mode 100644 index 0000000..238e202 --- /dev/null +++ b/dp/World.cpp @@ -0,0 +1,10 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "dp/World.h" +#include "dp/Group.h" +#include "dp/Context.h" + +namespace dp { +} // ::dp + diff --git a/dp/World.cu b/dp/World.cu deleted file mode 100644 index 7b976f2..0000000 --- a/dp/World.cu +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "dp/World.h" -#include "dp/Group.h" -#include "dp/Context.h" - -namespace dp { - - InstancesDPGroup::InstancesDPGroup(Context *context, - const std::vector &groups, - const DPRAffine *d_transforms) - : context(context), - groups(groups), - d_transforms(d_transforms) - { - impl = context->backend->createInstancesDPImpl(this); - } - - void InstancesDPGroup::traceRays(DPRRay *d_rays, DPRHit *d_hits, int numRays) - { - CUBQL_CUDA_SYNC_CHECK(); - if (!isDevicePointer(d_rays) || - !isDevicePointer(d_hits)) - throw std::runtime_error("the rays[] and hits[] arrays passed to dpTrace (currently?) have to point to device memory."); - impl->trace((Ray*)d_rays, - (Hit*)d_hits, - numRays); - } - -} // ::dp - diff --git a/dp/World.h b/dp/World.h index f46a1df..d173cb3 100644 --- a/dp/World.h +++ b/dp/World.h @@ -1,28 +1,34 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "dp/Triangles.h" +#include "dp/Group.h" namespace dp { struct Context; - struct Group; - - struct InstancesDPImpl; - - struct InstancesDPGroup { - InstancesDPGroup(Context *context, - const std::vector &groups, - const DPRAffine *d_transforms); - - void traceRays(DPRRay *d_rays, DPRHit *d_hits, int numRays); - - std::vector const groups; - const DPRAffine *const d_transforms; - Context *const context; - std::shared_ptr impl; + struct TrianglesGroup; + + /*! a group of double precision instances; each instance is defined + by a affine transforms and TrianglesDPGroup that it refers to */ + struct InstanceGroup : public Group { + InstanceGroup(Context *context, + const std::vector &groups, + const DPRAffine *transforms) + : Group(context) + {} + + /*! implements dprTrace() */ + virtual void traceRays(DPRRay *d_rays, + DPRHit *d_hits, + int numRays, + uint64_t flags) = 0; + + /* iw - note this base class will NOT store any pointers to host + data, it's the job of the derived class(es) to store data as, + if, and where required*/ }; } // ::dp diff --git a/dp/common.h b/dp/common.h index e2132be..c6a5eff 100644 --- a/dp/common.h +++ b/dp/common.h @@ -3,7 +3,7 @@ #pragma once -#include "deepee/deepee.h" +#include "deepeeRT/deepeeRT.h" #include "cuBQL/math/box.h" #include "cuBQL/math/linear.h" #include diff --git a/dp/cuBQL/CMakeLists.txt b/dp/cuBQL/CMakeLists.txt index 9cc012c..70c096f 100644 --- a/dp/cuBQL/CMakeLists.txt +++ b/dp/cuBQL/CMakeLists.txt @@ -1,14 +1,19 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -add_library(deepeeRT_cuBQL STATIC +add_library(deepeeRT_cuBQL OBJECT CuBQLBackend.h - CuBQLBackend.cu + CuBQLBackend.cu + Triangles.h + Triangles.cu + InstanceGroup.h + InstanceGroup.cu ) -target_link_libraries(deepeeRT_cuBQL PUBLIC +target_link_libraries(deepeeRT_cuBQL + PUBLIC deepeeRT_common cuBQL -# $ - $ + # do not link to cuBQL_cuda_double3_static; we explicitly + # instantiate builder(s) in CubQLBackend.cu ) diff --git a/dp/cuBQL/CuBQLBackend.cu b/dp/cuBQL/CuBQLBackend.cu index 8db7220..3fbf7f6 100644 --- a/dp/cuBQL/CuBQLBackend.cu +++ b/dp/cuBQL/CuBQLBackend.cu @@ -1,305 +1,83 @@ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +#define CUBQL_GPU_BUILDER_IMPLEMENTATION 1 +#define CUBQL_CPU_BUILDER_IMPLEMENTATION 1 + #include "dp/cuBQL/CuBQLBackend.h" -#include "dp/Context.h" -#include "dp/Group.h" -#include "dp/World.h" +#include "dp/cuBQL/Triangles.h" +#include "dp/cuBQL/InstanceGroup.h" + +namespace cuBQL { + namespace cpu { + template + void spatialMedian(BinaryBVH &bvh, + const box_t *boxes, + uint32_t numPrims, + BuildConfig buildConfig); + template + void freeBVH(BinaryBVH &bvh); + + } + namespace cuda { + template + void sahBuilder(BinaryBVH &bvh, + const box_t *boxes, + uint32_t numPrims, + BuildConfig buildConfig, + cudaStream_t s, + GpuMemoryResource &memResource); + + template + void free(BinaryBVH &bvh, + cudaStream_t s, + GpuMemoryResource& memResource); + } +} -#include -#include -#include -#include -#include namespace dp { namespace cubql_cuda { - using namespace ::cuBQL; - - using bvh3d = bvh_t; - - using TriangleDP = cuBQL::triangle_t; - using RayTriangleIntersection = cuBQL::RayTriangleIntersection_t; - - /*! an array that can upload an array from host to device, and free - on destruction. If the pointer provided is *already* a device - pointer this will just use that pointer */ - template - struct AutoUploadArray { - AutoUploadArray(const T *elements, int count) - { - this->count = count; - if (isDevicePointer(elements)) { - this->elements = elements; - this->needsCudaFree = false; - } else { - cudaMalloc((void **)&this->elements,count*sizeof(T)); - cudaMemcpy((void*)this->elements,elements,count*sizeof(T), - cudaMemcpyDefault); - this->needsCudaFree = true; - } - } - - ~AutoUploadArray() { if (needsCudaFree) cudaFree((void*)elements); } - - const T *elements = 0; - int count = 0; - bool needsCudaFree = false; - }; - - /*! triangle mesh representation on the device */ - struct DevMesh { - const vec3d *vertices; - const vec3i *indices; - uint64_t userData; - }; - - struct HostMesh { - HostMesh(uint64_t userData, - const vec3d *verticesArray, int verticesCount, - const vec3i *indicesArray, int indicesCount) - : userData(userData), - vertices(verticesArray,verticesCount), - indices(indicesArray,indicesCount) - {} - - DevMesh getDD() const - { return { vertices.elements, indices.elements, userData }; } - - AutoUploadArray vertices; - AutoUploadArray indices; - uint64_t const userData; - }; - - struct TrianglesDP : public dp::TrianglesDPImpl { - TrianglesDP(CuBQLCUDABackend *be, - dp::TrianglesDPGroup *fe); - virtual ~TrianglesDP(); - - struct DevGroup { - inline __cubql_both TriangleDP getTriangle(PrimRef prim) const; - - bvh3d bvh; - DevMesh *meshes; - PrimRef *primRefs; - }; - - DevGroup getDevGroup() const - { return { bvh,meshes,primRefs }; } - - bvh3d bvh; - // DevGroup *group = nullptr; - DevMesh *meshes = nullptr; - PrimRef *primRefs = nullptr; - - /*! these are stored and owned on the host, and also manage their - vertex arrays' ownerhip; but vertex arrays themselves will be - device accessible */ - std::vector> hostMeshes; - - CuBQLCUDABackend *const be; - }; - - inline __cubql_both - TriangleDP TrianglesDP::DevGroup::getTriangle(PrimRef prim) const + CuBQLCUDABackend::CuBQLCUDABackend(int gpuID) + : Context(gpuID) { - DevMesh mesh = meshes[prim.geomID]; - vec3i idx = mesh.indices[prim.primID]; - return { mesh.vertices[idx.x],mesh.vertices[idx.y],mesh.vertices[idx.z] }; + SetActiveGPU forDuration(gpuID); + cudaFree(0); } - - __global__ - void generateTriangleInputs(int meshID, - PrimRef *primRefs, - box3d *primBounds, - int numTrisThisMesh, - DevMesh mesh) - { - int tid = threadIdx.x+blockIdx.x*blockDim.x; - if (tid >= numTrisThisMesh) return; - vec3i idx = mesh.indices[tid]; - box3d bb; - bb.extend(mesh.vertices[idx.x]); - bb.extend(mesh.vertices[idx.y]); - bb.extend(mesh.vertices[idx.z]); - primRefs[tid] = { meshID, tid }; - primBounds[tid] = bb; - } - - TrianglesDP::TrianglesDP(CuBQLCUDABackend *be, - dp::TrianglesDPGroup *fe) - : TrianglesDPImpl(fe), be(be) - { - SetActiveGPU forDuration(be->gpuID); - - int numTrisTotal = 0; - std::vector devMeshes; - for (auto geom : fe->geoms) { - numTrisTotal += geom->indexCount; - // this will automatically upload the vertex arrays if so required: - auto hm = std::make_shared - (geom->userData, - geom->vertexArray,geom->vertexCount, - geom->indexArray,geom->indexCount); - hostMeshes.push_back(hm); - devMeshes.push_back(hm->getDD()); - } - cudaMalloc((void **)&meshes,devMeshes.size()*sizeof(DevMesh)); - cudaMemcpy((void*)meshes,devMeshes.data(), - devMeshes.size()*sizeof(DevMesh),cudaMemcpyDefault); - - cudaMalloc((void **)&primRefs,numTrisTotal*sizeof(*primRefs)); - - box3d *primBounds = nullptr; - cudaMalloc((void **)&primBounds,numTrisTotal*sizeof(*primBounds)); - - int offset = 0; - for (int meshID=0;meshID<(int)hostMeshes.size();meshID++) { - auto &hm = hostMeshes[meshID]; - int count = hm->indices.count; - int bs = 128; - int nb = divRoundUp(count,bs); - generateTriangleInputs<<>>(meshID, - primRefs+offset, - primBounds+offset, - count, - hm->getDD()); - offset += count; - } - cudaStreamSynchronize(0); - - std::cout << "#dpr: building BVH over " << prettyNumber(numTrisTotal) - << " triangles" << std::endl; - CUBQL_CUDA_SYNC_CHECK(); - DeviceMemoryResource memResource; -#if 0 - ::cuBQL::gpuBuilder(bvh, - primBounds, - numTrisTotal, - ::cuBQL::BuildConfig(), - 0, - memResource); -#else - ::cuBQL::cuda::sahBuilder(bvh, - primBounds, - numTrisTotal, - ::cuBQL::BuildConfig(), - 0, - memResource); -#endif - std::cout << "#dpr: ... bvh built." << std::endl; - - cudaFree(primBounds); - CUBQL_CUDA_SYNC_CHECK(); - } - - TrianglesDP::~TrianglesDP() + dp::InstanceGroup * + CuBQLCUDABackend:: + createInstanceGroup(const std::vector &groups, + const DPRAffine *transforms) { - cudaFree(meshes); - cudaFree(primRefs); - ::cuBQL::cuda::free(bvh); + return new InstanceGroup(this, groups,(const affine3d*)transforms); } - __global__ void g_trace(TrianglesDP::DevGroup group, - Ray *rays, - Hit *hits, - int numRays) + dp::TriangleMesh * + CuBQLCUDABackend:: + createTriangleMesh(uint64_t userData, + const vec3d *vertexArray, + int vertexCount, + const vec3i *indexArray, + int indexCount) { - int tid = threadIdx.x+blockIdx.x*blockDim.x; - if (tid >= numRays) return; - -#ifdef NDEBUG - const bool dbg = false; -#else - bool dbg = (tid == -1); -#endif - - - Hit hit = hits[tid]; - hit.primID = -1; - int instID = 0; - ::cuBQL::ray3d ray(rays[tid].origin, - rays[tid].direction, - rays[tid].tMin, - rays[tid].tMax); - - if (dbg) { - cuBQL::dout << "dbg ray " << ray << "\n"; - cuBQL::dout << "bvh.nodes " << (int*)group.bvh.nodes << "\n"; - cuBQL::dout << "bvh.primIDs " << (int*)group.bvh.primIDs << "\n"; - cuBQL::dout << "group.meshes " << (int*)group.meshes << "\n"; - cuBQL::dout << "group.mesh0 " << group.meshes[0].userData << "\n"; - cuBQL::dout << "group.primRefs " << (int*)group.primRefs << "\n"; - } - auto intersectPrim = [&ray,&hit,group,instID,dbg](uint32_t primID) -> double { - RayTriangleIntersection isec; - PrimRef prim = group.primRefs[primID]; - const TriangleDP tri = group.getTriangle(prim); - if (isec.compute(ray,tri)) { - hit.primID = prim.primID; - hit.instID = instID; - hit.geomUserData = group.meshes[prim.geomID].userData; - hit.t = isec.t; - if (dbg) printf("hit %i %i\n",hit.instID,hit.primID); - ray.tMax = isec.t; - } - return ray.tMax; - }; - ::cuBQL::shrinkingRayQuery::forEachPrim(intersectPrim,group.bvh,ray); - hits[tid] = hit;; + return new TriangleMesh(this, + userData, + vertexArray, + vertexCount, + indexArray, + indexCount); } - - - struct InstancesDP : public dp::InstancesDPImpl { - InstancesDP(CuBQLCUDABackend *be, - dp::InstancesDPGroup *fe) - : InstancesDPImpl(fe), be(be) - {} - void trace(Ray *rays, - Hit *hits, - int numRays) override; - CuBQLCUDABackend *const be; - }; - - void InstancesDP::trace(Ray *rays, - Hit *hits, - int numRays) + + dp::TrianglesGroup * + CuBQLCUDABackend:: + createTrianglesGroup(const std::vector &geoms) { - CUBQL_CUDA_SYNC_CHECK(); - assert(fe->groups.size() == 1); - assert(fe->d_transforms == nullptr); - TrianglesDPGroup *tg = (TrianglesDPGroup *)fe->groups[0]; - assert(tg); - TrianglesDP *triangles = (TrianglesDP*)tg->impl.get(); - assert(triangles); - - int bs = 128; - int nb = divRoundUp(numRays,bs); - g_trace<<>>(triangles->getDevGroup(), - rays,hits, - numRays); - CUBQL_CUDA_SYNC_CHECK(); + return new TrianglesGroup(this,geoms); } - } // :: cubql_cuda - CuBQLCUDABackend::CuBQLCUDABackend(Context *const context) - : Backend(context) - { - SetActiveGPU forDuration(context->gpuID); - cudaFree(0); - } - - std::shared_ptr - CuBQLCUDABackend::createInstancesDPImpl(dp::InstancesDPGroup *fe) - { return std::make_shared(this,fe); } - - std::shared_ptr - CuBQLCUDABackend::createTrianglesDPImpl(dp::TrianglesDPGroup *fe) - { return std::make_shared(this,fe); } - } diff --git a/dp/cuBQL/CuBQLBackend.h b/dp/cuBQL/CuBQLBackend.h index 962a43a..7b2eda4 100644 --- a/dp/cuBQL/CuBQLBackend.h +++ b/dp/cuBQL/CuBQLBackend.h @@ -3,24 +3,103 @@ #pragma once -#include "dp/Backend.h" -#include "dp/Group.h" +#include "dp/Context.h" #include +#include +#include +#include +#include +#include +#include +#include namespace dp { - - struct CuBQLCUDABackend : public dp::Backend - { - CuBQLCUDABackend(Context *const context); - virtual ~CuBQLCUDABackend() = default; + namespace cubql_cuda { + + using namespace ::cuBQL; - virtual std::shared_ptr - createInstancesDPImpl(dp::InstancesDPGroup *fe) override; + using bvh3d = bvh_t; + using TriangleDP = cuBQL::triangle_t; + using RayTriangleIntersection = cuBQL::RayTriangleIntersection_t; + using cuBQL::affine3d; + + /*! an array that can upload an array from host to device, and free + on destruction. If the pointer provided is *already* a device + pointer this will just use that pointer */ + template + struct AutoUploadArray { + AutoUploadArray() = default; + AutoUploadArray(const T *elements, size_t count); + AutoUploadArray(const AutoUploadArray &other) = delete; + ~AutoUploadArray(); + + // move operator + AutoUploadArray &operator=(AutoUploadArray &&other); + const T *elements = 0; + size_t count = 0; + bool needsCudaFree = false; + }; + + struct CuBQLCUDABackend : public dp::Context + { + CuBQLCUDABackend(int gpuID); + virtual ~CuBQLCUDABackend() = default; + + dp::InstanceGroup * + createInstanceGroup(const std::vector &groups, + const DPRAffine *transforms) override; - virtual std::shared_ptr - createTrianglesDPImpl(dp::TrianglesDPGroup *fe) override; - }; + dp::TriangleMesh * + createTriangleMesh(uint64_t userData, + const vec3d *vertexArray, + int vertexCount, + const vec3i *indexArray, + int indexCount) override; + + dp::TrianglesGroup * + createTrianglesGroup(const std::vector &geoms) override; + }; + +#ifdef __CUDACC__ + // ================================================================== + // INLINE IMPLEMENTATION SECTION + // ================================================================== + template inline + AutoUploadArray::AutoUploadArray(const T *elements, + size_t count) + { + this->count = count; + // if (isDevicePointer(elements)) { + // this->elements = elements; + // this->needsCudaFree = false; + // } else { + // iw - for now, ALWAYS create a copy + CUBQL_CUDA_SYNC_CHECK(); + cudaMalloc((void **)&this->elements,count*sizeof(T)); + cudaMemcpy((void*)this->elements,elements,count*sizeof(T), + cudaMemcpyDefault); + this->needsCudaFree = true; + CUBQL_CUDA_SYNC_CHECK(); + } + template inline + AutoUploadArray & + AutoUploadArray::operator=(AutoUploadArray &&other) + { + elements = other.elements; other.elements = 0; + count = other.count; other.count = 0; + needsCudaFree = other.needsCudaFree; other.needsCudaFree = 0; + return *this; + } + + template inline + AutoUploadArray::~AutoUploadArray() { + if (needsCudaFree) cudaFree((void*)elements); + CUBQL_CUDA_SYNC_CHECK(); + } +#endif + + } } diff --git a/dp/cuBQL/Group.cu b/dp/cuBQL/Group.cu new file mode 100644 index 0000000..f32d571 --- /dev/null +++ b/dp/cuBQL/Group.cu @@ -0,0 +1,212 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "dp/cuBQL/CuBQLBackend.h" + +namespace dp { + namespace cubql_cuda { + __global__ + void g_traceFirstGroupOnly(TrianglesDP::DevGroup group, + Ray *rays, + Hit *hits, + int numRays) + { + int tid = threadIdx.x+blockIdx.x*blockDim.x; + if (tid >= numRays) return; + +#ifdef NDEBUG + const bool dbg = false; +#else + bool dbg = (tid == -1); +#endif + + Hit hit = hits[tid]; + hit.primID = -1; + int instID = 0; + ::cuBQL::ray3d ray(rays[tid].origin, + rays[tid].direction, + rays[tid].tMin, + rays[tid].tMax); + + if (dbg) { + cuBQL::dout << "dbg ray " << ray << "\n"; + cuBQL::dout << "bvh.nodes " << (int*)group.bvh.nodes << "\n"; + cuBQL::dout << "bvh.primIDs " << (int*)group.bvh.primIDs << "\n"; + cuBQL::dout << "group.meshes " << (int*)group.meshes << "\n"; + cuBQL::dout << "group.mesh0 " << group.meshes[0].userData << "\n"; + cuBQL::dout << "group.primRefs " << (int*)group.primRefs << "\n"; + } + auto intersectPrim = [&ray,&hit,group,instID,dbg](uint32_t primID) + -> double + { + RayTriangleIntersection isec; + PrimRef prim = group.primRefs[primID]; + const TriangleDP tri = group.getTriangle(prim); + if (isec.compute(ray,tri)) { + hit.primID = prim.primID; + hit.instID = instID; + hit.geomUserData = group.meshes[prim.geomID].userData; + hit.t = isec.t; + if (dbg) printf("hit %i %i\n",hit.instID,hit.primID); + ray.tMax = isec.t; + } + return ray.tMax; + }; + ::cuBQL::shrinkingRayQuery::forEachPrim(intersectPrim,group.bvh,ray); + hits[tid] = hit;; + } + + + + + + __global__ + void g_traceWorld(/*! the device data for the instancegroup itself */ + InstanceGroup::DD model, + /*! the list of instance transforms */ + const DPRAffine *const d_transforms, + /*! the list of instantiated groups */ + const TrianglesDP::DevGroup *d_instantiatedGroups, + Ray *rays, + Hit *hits, + int numRays) + { + int tid = threadIdx.x+blockIdx.x*blockDim.x; + if (tid >= numRays) return; + +#ifdef NDEBUG + const bool dbg = false; +#else + bool dbg = (tid == -1); +#endif + + Hit hit = hits[tid]; + hit.primID = -1; + hit.instID = -1; + struct { + int instID = -1; + TrianglesDP::Group group; + ::cuBQL::ray3d ray; + } object; + ::cuBQL::ray3d worldRay(rays[tid].origin, + rays[tid].direction, + rays[tid].tMin, + rays[tid].tMax); + + auto intersectPrim = [&ray,&hit,&object,instID,dbg](uint32_t primID) + -> double + { + RayTriangleIntersection isec; + PrimRef prim = object.group.primRefs[primID]; + const TriangleDP tri = group.getTriangle(prim); + if (isec.compute(ray,tri)) { + hit.primID = prim.primID; + hit.instID = object.instID; + hit.geomUserData = group.meshes[prim.geomID].userData; + hit.t = isec.t; + if (dbg) printf("hit %i %i\n",hit.instID,hit.primID); + ray.tMax = isec.t; + } + return ray.tMax; + }; + auto enterBlas = [this,model,¤t] + (cuBQL::ray3f &out_ray, + cuBQL::bvh3f &out_bvh, + int instID) + { + current.group = d_instantiatedGroups[instID]; + current.instID = instID; + object.ray = world.ray; + if (!isUnitTransform(currentInstance->worldToObjectXfm)) { + object.ray.origin + = xfmPoint(currentInstance->worldToObjectXfm,world.origin); + object.ray.direction + = xfmVector(currentInstance->worldToObjectXfm,world.direction); + } + out_bvh = {0,0,0,0}; + out_bvh.nodes = object.group.bvhNodes; + }; + auto leaveBlas = [this]() -> void { + /* nothing to do */ + }; + + ::cuBQL::shrinkingRayQuery::twoLevel::forEachPrim + (enterBlas,leaveBlas,intersectPrim,model->bvh,ray); + + hits[tid] = hit; + } + + + struct InstancesDP : public dp::InstancesDPImpl { + InstancesDP(CuBQLCUDABackend *be, + dp::InstancesDPGroup *fe) + : InstancesDPImpl(fe), be(be) + { + int numInstances = fe->instances.size(); + if (numInstances == 0) return; + + std::vector instancedGroups; + for (auto feGroup : fe->groups) { + dp::TrianglesDPGroup *group = feGroup + instancedGroups.push_back(inst + } + cudaMalloc((void **)&d_instancedGroups, + numInstances*sizeof(*d_triangleGroups)); + cudaMemcpy((void*)d_instancedGroups,instancedGroups.data(), + numInstances*sizeof(*d_triangleGroups), + cudaMemcpyDefault); + } + void trace(Ray *rays, + Hit *hits, + int numRays) override; + + TrianglesDP::DevGroup *d_instancedGroups = 0; + CuBQLCUDABackend *const be; + }; + + void InstancesDP::trace(Ray *rays, + Hit *hits, + int numRays) + { + CUBQL_CUDA_SYNC_CHECK(); + int bs = 128; + int nb = divRoundUp(numRays,bs); +#if 0 + assert(fe->groups.size() == 1); + assert(fe->d_transforms == nullptr); + TrianglesDPGroup *tg = (TrianglesDPGroup *)fe->groups[0]; + assert(tg); + TrianglesDP *triangles = (TrianglesDP*)tg->impl.get(); + assert(triangles); + + g_traceFirstGroupOnly<<>>(triangles->getDevGroup(), + rays,hits, + numRays); +#else + g_traceInstances<<>>(triangles->getDevGroup(), + rays,hits, + numRays); +#endif + CUBQL_CUDA_SYNC_CHECK(); + } + + } // :: cubql_cuda + + CuBQLCUDABackend::CuBQLCUDABackend(Context *const context) + : Backend(context) + { + SetActiveGPU forDuration(context->gpuID); + cudaFree(0); + } + + std::shared_ptr + CuBQLCUDABackend::createInstancesDPImpl(dp::InstancesDPGroup *fe) + { return std::make_shared(this,fe); } + + std::shared_ptr + CuBQLCUDABackend::createTrianglesDPImpl(dp::TrianglesDPGroup *fe) + { return std::make_shared(this,fe); } + +} + + diff --git a/dp/cuBQL/InstanceGroup.cu b/dp/cuBQL/InstanceGroup.cu new file mode 100644 index 0000000..4405f4a --- /dev/null +++ b/dp/cuBQL/InstanceGroup.cu @@ -0,0 +1,223 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "dp/cuBQL/InstanceGroup.h" +#include "dp/cuBQL/Triangles.h" + +namespace dp { + namespace cubql_cuda { + + __global__ + void g_prepareInstances(int numInstances, + InstanceGroup::InstancedObjectDD *instances, + bool hasTransforms, + affine3d *worldToObjectXfms, + affine3d *objectToWorldXfms, + box3d *d_instBounds) + { + int tid = threadIdx.x+blockIdx.x*blockDim.x; + if (tid >= numInstances) return; + affine3d xfm; + if (!hasTransforms) { + xfm = affine3d(); + objectToWorldXfms[tid] = xfm; + } else { + xfm = objectToWorldXfms[tid]; + } + worldToObjectXfms[tid] = rcp(xfm); + instances[tid].hasXfm = (xfm != affine3d()); + + box3d objBounds = instances[tid].group.bvh.nodes[0].bounds; + vec3d b0 = objBounds.lower; + vec3d b1 = objBounds.upper; + box3d instBounds; + instBounds.extend(xfmPoint(xfm,vec3d(b0.x,b0.y,b0.z))); + instBounds.extend(xfmPoint(xfm,vec3d(b0.x,b0.y,b1.z))); + instBounds.extend(xfmPoint(xfm,vec3d(b0.x,b1.y,b0.z))); + instBounds.extend(xfmPoint(xfm,vec3d(b0.x,b1.y,b1.z))); + instBounds.extend(xfmPoint(xfm,vec3d(b1.x,b0.y,b0.z))); + instBounds.extend(xfmPoint(xfm,vec3d(b1.x,b0.y,b1.z))); + instBounds.extend(xfmPoint(xfm,vec3d(b1.x,b1.y,b0.z))); + instBounds.extend(xfmPoint(xfm,vec3d(b1.x,b1.y,b1.z))); + d_instBounds[tid] = instBounds; + } + + InstanceGroup::InstanceGroup(Context *context, + const std::vector &groups, + const affine3d *transforms) + : dp::InstanceGroup(context,groups, + (const DPRAffine *)transforms), + numInstances((int)groups.size()) + { + CUBQL_CUDA_SYNC_CHECK(); + assert(numInstances > 0); + std::vector instanceDDs; + for (auto _group : groups) { + InstancedObjectDD instance; + TrianglesGroup *group = (TrianglesGroup*)_group; + instance.group = group->getDD(); + instanceDDs.push_back(instance); + } + + cudaMalloc((void**)&d_instanceDDs, + numInstances*sizeof(*d_instanceDDs)); + cudaMemcpy(d_instanceDDs, + instanceDDs.data(), + numInstances*sizeof(*d_instanceDDs), + cudaMemcpyDefault); + + cudaMalloc((void**)&d_worldToObjectXfms, + numInstances*sizeof(affine3d)); + cudaMalloc((void**)&d_objectToWorldXfms, + numInstances*sizeof(affine3d)); + if (transforms) + cudaMemcpy(d_objectToWorldXfms, + transforms, + numInstances*sizeof(affine3d), + cudaMemcpyDefault); + box3d *d_instBounds = 0; + cudaMalloc((void**)&d_instBounds, + numInstances*sizeof(box3d)); + g_prepareInstances + <<>> + (numInstances, + d_instanceDDs, + transforms != 0, + d_worldToObjectXfms, + d_objectToWorldXfms, + d_instBounds); + CUBQL_CUDA_SYNC_CHECK(); + + DeviceMemoryResource memResource; + ::cuBQL::BuildConfig buildConfig; + buildConfig.maxAllowedLeafSize = 1; + std::cout << "==================================================================" << std::endl; + PING; + std::cout << "TOP" << std::endl; + ::cuBQL::cuda::sahBuilder(bvh, + d_instBounds, + numInstances, + buildConfig, + 0, + memResource); + + CUBQL_CUDA_SYNC_CHECK(); + PING; + cudaFree(d_instBounds); + } + + InstanceGroup::~InstanceGroup() + { + cudaFree(d_instanceDDs); + cudaFree(d_objectToWorldXfms); + cudaFree(d_worldToObjectXfms); + } + + InstanceGroup::DD InstanceGroup::getDD() const + { + return { d_instanceDDs, d_worldToObjectXfms, bvh }; + } + + __global__ + void g_traceRays(/*! the device data for the instancegroup itself */ + InstanceGroup::DD world, + DPRRay *rays, + DPRHit *hits, + int numRays, + uint64_t flags) + { + int tid = threadIdx.x+blockIdx.x*blockDim.x; + if (tid >= numRays) return; + +#ifdef NDEBUG + const bool dbg = false; +#else + bool dbg = false;//(tid == 512*1024+512); +#endif + + DPRHit hit = hits[tid]; + hit.primID = -1; + hit.instID = -1; + hit.t = 1e30; + struct ObjectSpaceTravState { + int instID = -1; + InstanceGroup::InstancedObjectDD instance; + ::cuBQL::ray3d ray; + } objectSpace; + ::cuBQL::ray3d worldRay((const vec3d&)rays[tid].origin, + (const vec3d&)rays[tid].direction, + rays[tid].tMin, + rays[tid].tMax); + + auto intersectPrim + = [&hit,&worldRay,&objectSpace,flags,dbg](uint32_t primID) + -> double + { + RayTriangleIntersection isec; + auto &group = objectSpace.instance.group; + PrimRef prim = group.primRefs[primID]; + const TriangleDP tri = group.getTriangle(prim); + + auto getNormal = [tri]() { return cross(tri.b-tri.a,tri.c-tri.a); }; + bool culled = false; + if (flags & DPR_CULL_FRONT) + culled |= (dot(getNormal(),objectSpace.ray.direction) <= 0.); + if (flags & DPR_CULL_BACK) + culled |= (dot(getNormal(),objectSpace.ray.direction) >= 0.); + if (!culled && isec.compute(objectSpace.ray,tri)) { + hit.primID = prim.primID; + hit.instID = objectSpace.instID; + hit.geomUserData = group.meshes[prim.geomID].userData; + hit.t = isec.t; + worldRay.tMax = isec.t; + } + return worldRay.tMax; + }; + auto enterBlas = [world,worldRay,&objectSpace,dbg] + (cuBQL::ray3d &out_ray, + cuBQL::bvh3d &out_bvh, + int instID) + { + objectSpace.instance = world.instancedGroups[instID]; + objectSpace.instID = instID; + objectSpace.ray = worldRay; + if (objectSpace.instance.hasXfm) { + affine3d worldToObjectXfm = world.worldToObjectXfms[instID]; + objectSpace.ray.origin + = xfmPoint(worldToObjectXfm,worldRay.origin); + objectSpace.ray.direction + = xfmVector(worldToObjectXfm,worldRay.direction); + } + out_ray = objectSpace.ray; + if (dbg) dout << "out ray " << out_ray << "\n"; + out_bvh = objectSpace.instance.group.bvh; + // out_bvh.nodes = objectSpace.instance.group.bvh.nodes; + }; + auto leaveBlas = []() -> void { + /* nothing to do */ + }; + + ::cuBQL::shrinkingRayQuery::twoLevel::forEachPrim + (enterBlas,leaveBlas,intersectPrim,world.bvh,worldRay,dbg); + + hits[tid] = hit; + } + + + void InstanceGroup::traceRays(DPRRay *d_rays, + DPRHit *d_hits, + int numRays, + uint64_t flags) + { + int bs = 128; + int nb = divRoundUp(numRays,bs); + g_traceRays<<>>(getDD(), + d_rays,d_hits,numRays, + flags); + cudaDeviceSynchronize(); + } + + } +} + diff --git a/dp/cuBQL/InstanceGroup.h b/dp/cuBQL/InstanceGroup.h new file mode 100644 index 0000000..e4c1225 --- /dev/null +++ b/dp/cuBQL/InstanceGroup.h @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "dp/cuBQL/CuBQLBackend.h" +#include "dp/cuBQL/Triangles.h" +#include "dp/World.h" + +namespace dp { + namespace cubql_cuda { + + /*! a single triangle mesh; can be created over pointes that are + either on host or device, but which definitively stores + vertices on the device */ + struct InstanceGroup : public dp::InstanceGroup { + struct InstancedObjectDD { + TrianglesGroup::DD group; + bool hasXfm; + }; + struct DD { + const InstancedObjectDD *instancedGroups; + const affine3d *worldToObjectXfms; + bvh3d bvh; + }; + + InstanceGroup(Context *context, + const std::vector &groups, + const affine3d *transforms); + ~InstanceGroup() override; + + DD getDD() const; + + void traceRays(DPRRay *d_rays, + DPRHit *d_hits, + int numRays, + uint64_t flags) override; + + int numInstances = 0; + InstancedObjectDD *d_instanceDDs = 0; + affine3d *d_worldToObjectXfms = 0; + affine3d *d_objectToWorldXfms = 0; + bvh3d bvh; + }; + + } +} diff --git a/dp/cuBQL/Triangles.cu b/dp/cuBQL/Triangles.cu new file mode 100644 index 0000000..998a850 --- /dev/null +++ b/dp/cuBQL/Triangles.cu @@ -0,0 +1,119 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "dp/cuBQL/Triangles.h" + +namespace dp { + namespace cubql_cuda { + + /*! triangle mesh representation on the device */ + __global__ + void generateTriangleInputs(int meshID, + PrimRef *primRefs, + box3d *primBounds, + int numTrisThisMesh, + TriangleMesh::DD mesh) + { + int tid = threadIdx.x+blockIdx.x*blockDim.x; + if (tid >= numTrisThisMesh) return; + + vec3i idx = mesh.indices[tid]; + box3d bb; + bb.extend(mesh.vertices[idx.x]); + bb.extend(mesh.vertices[idx.y]); + bb.extend(mesh.vertices[idx.z]); + + primRefs[tid] = { meshID, tid }; + primBounds[tid] = bb; + } + + TriangleMesh::TriangleMesh(Context *context, + uint64_t userData, + const vec3d *vertexArray, + int vertexCount, + const vec3i *indexArray, + int indexCount) + : dp::TriangleMesh(context,userData, + vertexArray,vertexCount, + indexArray,indexCount), + vertices(vertexArray,vertexCount), + indices(indexArray,indexCount) + {} + + TrianglesGroup::TrianglesGroup(Context *context, + const std::vector &meshes) + : dp::TrianglesGroup(context,meshes) + { + CUBQL_CUDA_SYNC_CHECK(); + SetActiveGPU forDuration(context->gpuID); + + int numTrisTotal = 0; + std::vector devMeshes; + for (auto _geom : meshes) { + TriangleMesh *geom = (TriangleMesh*)_geom; + devMeshes.push_back(geom->getDD()); + numTrisTotal += geom->indices.count; + } + cudaMalloc((void **)&d_meshDDs, + devMeshes.size()*sizeof(TriangleMesh::DD)); + cudaMemcpy((void*)d_meshDDs,devMeshes.data(), + devMeshes.size()*sizeof(TriangleMesh::DD),cudaMemcpyDefault); + + cudaMalloc((void **)&d_primRefs,numTrisTotal*sizeof(*d_primRefs)); + + box3d *d_primBounds = nullptr; + cudaMalloc((void **)&d_primBounds,numTrisTotal*sizeof(*d_primBounds)); + + int offset = 0; + for (int meshID=0;meshID<(int)meshes.size();meshID++) { + TriangleMesh *mesh = (TriangleMesh *)meshes[meshID]; + int count = mesh->indices.count; + int bs = 128; + int nb = divRoundUp(count,bs); + generateTriangleInputs<<>>(meshID, + d_primRefs+offset, + d_primBounds+offset, + count, + mesh->getDD()); + offset += count; + } + cudaStreamSynchronize(0); + + // std::cout << "#dpr: building BVH over " << prettyNumber(numTrisTotal) + // << " triangles" << std::endl; + CUBQL_CUDA_SYNC_CHECK(); + DeviceMemoryResource memResource; +#if 0 + ::cuBQL::gpuBuilder(bvh, + d_primBounds, + numTrisTotal, + ::cuBQL::BuildConfig(), + 0, + memResource); +#else + ::cuBQL::cuda::sahBuilder(bvh, + d_primBounds, + numTrisTotal, + ::cuBQL::BuildConfig(), + 0, + memResource); +#endif + // std::cout << "#dpr: ... bvh built." << std::endl; + + cudaFree(d_primBounds); + CUBQL_CUDA_SYNC_CHECK(); + } + + TrianglesGroup::~TrianglesGroup() + { + CUBQL_CUDA_SYNC_CHECK(); + cudaFree(d_meshDDs); + cudaFree(d_primRefs); + ::cuBQL::cuda::free(bvh); + CUBQL_CUDA_SYNC_CHECK(); + } + + } +} + + diff --git a/dp/cuBQL/Triangles.h b/dp/cuBQL/Triangles.h new file mode 100644 index 0000000..cf56357 --- /dev/null +++ b/dp/cuBQL/Triangles.h @@ -0,0 +1,89 @@ +#pragma once + +#include "dp/cuBQL/CuBQLBackend.h" +#include "dp/Triangles.h" + +namespace dp { + namespace cubql_cuda { + + /*! a single triangle mesh; can be created over pointes that are + either on host or device, but which definitively stores + vertices on the device */ + struct TriangleMesh : public dp::TriangleMesh { + struct DD { + inline __cubql_both TriangleDP getTriangle(uint32_t primID) const; + + const vec3d *vertices; + const vec3i *indices; + uint64_t userData; + }; + + TriangleMesh(Context *context, + uint64_t userData, + const vec3d *vertexArray, + int vertexCount, + const vec3i *indexArray, + int indexCount); + + DD getDD() const + { return { vertices.elements, indices.elements, userData }; } + + AutoUploadArray vertices; + AutoUploadArray indices; + }; + + + /*! a group/acceleration structure over one or more triangle meshes */ + struct TrianglesGroup : public dp::TrianglesGroup { + /*! device data for a cubql group over one or more triangle + meshes */ + struct DD { + /*! return the triangle specified by the given primref */ + inline __cubql_both DD() = default; + inline __cubql_both TriangleDP getTriangle(PrimRef prim) const; + + TriangleMesh::DD *meshes; + PrimRef *primRefs; + bvh3d bvh; + }; + + TrianglesGroup(Context *context, + const std::vector &geoms); + ~TrianglesGroup() override; + + + DD getDD() const + { + DD dd; + dd.meshes = d_meshDDs; + dd.primRefs = d_primRefs; + dd.bvh = bvh; + return dd; + } + + bvh3d bvh; + TriangleMesh::DD *d_meshDDs; + PrimRef *d_primRefs; + }; + + inline __cubql_both + TriangleDP TriangleMesh::DD::getTriangle(uint32_t primID) const + { + vec3i idx = indices[primID]; + TriangleDP tri; + tri.a = vertices[idx.x]; + tri.b = vertices[idx.y]; + tri.c = vertices[idx.z]; + return tri; + } + + inline __cubql_both + TriangleDP TrianglesGroup::DD::getTriangle(PrimRef prim) const + { + const TriangleMesh::DD &mesh = meshes[prim.geomID]; + TriangleDP tri = mesh.getTriangle(prim.primID); + return tri; + } + + } +} diff --git a/dp/deepeeRT.cpp b/dp/deepeeRT.cpp index 1cd8454..7624ba8 100644 --- a/dp/deepeeRT.cpp +++ b/dp/deepeeRT.cpp @@ -7,15 +7,22 @@ #include "dp/Triangles.h" #include "dp/Group.h" #include "dp/World.h" +#include "dp/cuBQL/CuBQLBackend.h" namespace dp { + + Context *Context::create(int gpuID) + { + return new cubql_cuda::CuBQLCUDABackend(gpuID); + }; + } // ::dp DPR_API DPRContext dprContextCreate(DPRContextType contextType, int gpuToUse) { - return (DPRContext)new dp::Context(gpuToUse); + return (DPRContext)dp::Context::create(gpuToUse); } DPR_API @@ -27,20 +34,20 @@ DPRTriangles dprCreateTrianglesDP(DPRContext _context, yielded the intersection. */ uint64_t userData, /*! device array of vertices */ - DPRvec3 *d_vertexArray, + DPRvec3 *vertexArray, size_t vertexCount, /*! device array of int3 vertex indices */ - DPRint3 *d_indexArray, + DPRint3 *indexArray, size_t indexCount) { dp::Context *context = (dp::Context *)_context; assert(context); - return (DPRTriangles)new dp::TrianglesDP(context, - userData, - (const dp::vec3d*)d_vertexArray, - vertexCount, - (const dp::vec3i*)d_indexArray, - indexCount); + return (DPRTriangles)context-> + createTriangleMesh(userData, + (const dp::vec3d*)vertexArray, + vertexCount, + (const dp::vec3i*)indexArray, + indexCount); } DPR_API @@ -50,36 +57,33 @@ DPRGroup dprCreateTrianglesGroup(DPRContext _context, { dp::Context *context = (dp::Context *)_context; assert(context); - std::vector geoms; + std::vector geoms; for (int i=0;i<(int)triangleGeomsCount;i++) { - dp::TrianglesDP *geom = (dp::TrianglesDP *)triangleGeomsArray[i]; + dp::TriangleMesh *geom = (dp::TriangleMesh *)triangleGeomsArray[i]; assert(geom); assert(geom->context == context); geoms.push_back(geom); } - return (DPRGroup)new dp::TrianglesDPGroup(context,geoms); + return (DPRGroup)context->createTrianglesGroup(geoms); } - DPR_API DPRWorld dprCreateWorldDP(DPRContext _context, DPRGroup *instanceGroups, - DPRAffine *d_instanceTransforms, + DPRAffine *instanceTransforms, size_t instanceCount) { dp::Context *context = (dp::Context *)_context; assert(context); - assert(instanceCount == 1 && "instancing not yet implemented"); - assert(d_instanceTransforms == nullptr && "instancing not yet implemented"); - - std::vector groups; + std::vector groups; for (int i=0;i<(int)instanceCount;i++) { - dp::Group *group = (dp::Group *)instanceGroups[i]; + dp::TrianglesGroup *group = (dp::TrianglesGroup *)instanceGroups[i]; assert(group); groups.push_back(group); } - return (DPRWorld)new dp::InstancesDPGroup(context,groups,d_instanceTransforms); + return (DPRWorld)context-> + createInstanceGroup(groups,instanceTransforms); } DPR_API @@ -91,15 +95,41 @@ void dprTrace(/*! the world we want the rays to be traced against */ DPRHit *d_hits, /*! number of rays that need tracing. d_rays and d_hits *must* have (at least) that many entires */ - int numRays) + int numRays, + uint64_t flags) { - dp::InstancesDPGroup *world = (dp::InstancesDPGroup *)_world; + dp::InstanceGroup *world = (dp::InstanceGroup *)_world; assert(world); assert(d_hits); assert(d_rays); assert(numRays > 0); - world->traceRays(d_rays,d_hits,numRays); + world->traceRays(d_rays,d_hits,numRays,flags); +} + +DPR_API void dprFreeWorld(DPRWorld world) +{ + assert(world); + delete (dp::InstanceGroup *)world; +} + +DPR_API void dprFreeTriangles(DPRTriangles triangles) +{ + assert(triangles); + delete (dp::TriangleMesh *)triangles; +} + +DPR_API void dprFreeGroup(DPRGroup group) +{ + assert(group); + delete (dp::TrianglesGroup *)group; +} + +DPR_API void dprFreeContext(DPRContext context) +{ + assert(context); + delete (dp::Context *)context; } + diff --git a/dp/deepeeRTConfig.cmake.in b/dp/deepeeRTConfig.cmake.in new file mode 100644 index 0000000..ac740ad --- /dev/null +++ b/dp/deepeeRTConfig.cmake.in @@ -0,0 +1,42 @@ +## SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +## SPDX-License-Identifier: Apache-2.0 + +# originally copied-then-adapted-from the following: + +## Copyright 2021-2024 The Khronos Group +## SPDX-License-Identifier: 2-Apache.0 + +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) + +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_dependency(Threads) + +include(${CMAKE_CURRENT_LIST_DIR}/deepeeRT-config.cmake) + +if (NOT TARGET deepeeRT::deepeeRT) + message(FATAL_ERROR "CMAKE_PREFIX_PATH or deepeeRT_DIR are pointing to a \ + deepeeRT build directory. Please do a full install of deepeeRT \ + (e.g. 'make install') and point to where you installed it \ + (CMAKE_INSTALL_PREFIX in your build of deepeeRT). \ + Consuming deepeeRT from a build directory is not supported.") +endif() +set(deepeeRT_FOUND ON) + +set(DEEPEERT_DATAROOTDIR + ${CMAKE_CURRENT_LIST_DIR}/../../../@CMAKE_INSTALL_DATAROOTDIR@/deepeeRT + ) + +foreach(component ${deepeeRT_FIND_COMPONENTS}) + # For requested component, execute its "config" script + message("cfind component ${component}") + if (EXISTS "${CMAKE_CURRENT_LIST_DIR}/${component}-config.cmake") + include(${CMAKE_CURRENT_LIST_DIR}/${component}-config.cmake) + set(${component}_FOUND ON) + else() + set(${component}_FOUND OFF) + endif() +endforeach() + +check_required_components(@PROJECT_NAME@) diff --git a/include/deepee/deepee.h b/include/deepeeRT/deepeeRT.in.h similarity index 71% rename from include/deepee/deepee.h rename to include/deepeeRT/deepeeRT.in.h index 1d62127..3c290eb 100644 --- a/include/deepee/deepee.h +++ b/include/deepeeRT/deepeeRT.in.h @@ -44,6 +44,11 @@ #include #include +#define DEEPEERT_VERSION_MAJOR @DEEPEERT_VERSION_MAJOR@ +#define DEEPEERT_VERSION_MINOR @DEEPEERT_VERSION_MINOR@ +#define DEEPEERT_VERSION_PATCH @DEEPEERT_VERSION_PATCH@ + + #ifdef _WIN32 // for now, we only support a static build for this library # if 1 || defined(deepeeRT_STATIC) @@ -77,6 +82,14 @@ typedef struct _DPRContext *DPRContext; typedef enum { DPR_CONTEXT_GPU } DPRContextType; +#define DPR_FLAGS_NONE (uint64_t(0)) +/*! if enabled, we will skip all intersections with triangles whose + normal faces TOWARDS the origin (ie, cull iff dot(ray.dir,N)<0) */ +#define DPR_CULL_FRONT (uint64_t(1ull<<0)) +/*! if enabled, we will skip all intersections with triangles whose + normal faces AWAY the origin (ie, cull iff dot(ray.dir,N)>0) */ +#define DPR_CULL_BACK (uint64_t(1ull<<1)) + struct DPRint3 { int32_t x,y,z; }; struct DPRvec3 { double x,y,z; }; struct DPRvec4 { double x,y,z,w; }; @@ -113,7 +126,10 @@ struct DPRHit { instID this is *not* a linear ID, but whatever int64 value the user specified there. */ uint64_t geomUserData; - double tHit; + union { + double tHit; // << old (pre v0.2) spelling + double t; + }; double u, v; }; @@ -142,17 +158,29 @@ DPRTriangles dprCreateTrianglesDP(DPRContext context, DPRint3 *indexArray, size_t indexCount); +/*! create an object representing a group of one or more triangle + meshes that can then get instantiated (dpr never directly + instantiates individual triangle meshes, but always groups of + meshes. If you need to instantiate a single mesh you need to first + create a TrianglesGroup with that single mesh, then instantiate + this). */ DPR_API DPRGroup dprCreateTrianglesGroup(DPRContext, DPRTriangles *triangleGeomsArray, size_t triangleGeomsCount); +/*! creates a world over one or more triangle mesh groups; each + instance is defined by a handle to the group it wants to + instantaite, plus an associated transform that represents the + object-to-world transform supposed to be applied to this + geometry. */ DPR_API DPRWorld dprCreateWorldDP(DPRContext, DPRGroup *instanceGroups, DPRAffine *instanceTransforms, size_t instanceCount); +/*! traces a set of rays against a previously computed world. */ DPR_API void dprTrace(/*! the world we want the rays to be traced against */ DPRWorld world, @@ -162,11 +190,33 @@ void dprTrace(/*! the world we want the rays to be traced against */ DPRHit *d_hits, /*! number of rays that need tracing. d_rays and d_hits *must* have (at least) that many entires */ - int numRays); + int numRays, + uint64_t flags = 0ull); +/*! frees a previously created world. This should also free all the + memory that this world object has created for internal + acceleration structures, but will NOT free the groups that it was + created over. It is user's job to free those appropriately */ DPR_API void dprFreeWorld(DPRWorld world); + +/*! frees a previously created triangle mesh. This should also free + all the memory that this group has created for internal storage of + the triangles it was create over. Once freed world objects created + over this triangle mesh (or over objects created over this + triangle mesh) are no longer valid and may no longer get traced + against. */ DPR_API void dprFreeTriangles(DPRTriangles triangles); + +/*! frees a previously created triangle mesh group. This should also + free all the memory that this group has created for internal + acceleration structures, but will NOT free the triangle meshes + that it was created over. It is user's job to free those + appropriately. Once freed world objects created over this group + are no longer valid and may no longer get traced against. */ DPR_API void dprFreeGroup(DPRGroup group); + +/*! frees the root context. This is currently NOT guaranteed to free + all the objects created within this context. */ DPR_API void dprFreeContext(DPRContext context); diff --git a/miniapp/CMakeLists.txt b/miniapp/CMakeLists.txt index a0aec97..995bc27 100644 --- a/miniapp/CMakeLists.txt +++ b/miniapp/CMakeLists.txt @@ -11,3 +11,13 @@ add_executable(dpMiniApp ) target_link_libraries(dpMiniApp PUBLIC deepeeRT) +add_executable(dpInstanceMiniApp + DGEF.h + DGEF.cpp + import_OBJ.cpp + Camera.h + Camera.cu + instanceMiniApp.cu +) +target_link_libraries(dpInstanceMiniApp PUBLIC deepeeRT) + diff --git a/miniapp/Camera.cu b/miniapp/Camera.cu index 5e72d80..d67f74f 100644 --- a/miniapp/Camera.cu +++ b/miniapp/Camera.cu @@ -23,7 +23,7 @@ namespace miniapp { dv *= scale; du *= scale*aspect; - + #if 0 // for testing: this is a ortho camera with parallel rays and // different origins all n a plane diff --git a/miniapp/Camera.h b/miniapp/Camera.h index 6183aa6..85cf3c0 100644 --- a/miniapp/Camera.h +++ b/miniapp/Camera.h @@ -24,7 +24,8 @@ namespace miniapp { /*! generate ray for a given pixel/image plane coordinate. based on how the camera was created this could be either a orthogonal or a perspective camera (see Camera.cpp) */ - inline __device__ Ray generateRay(vec2d pixel, bool dbg=false) const; + inline __device__ + Ray generateRay(vec2d pixel, bool dbg=false) const; struct { vec3d v,du,dv; @@ -45,6 +46,12 @@ namespace miniapp { Ray ray; ray.origin = origin.v+pixel.x*origin.du+pixel.y*origin.dv; + if (dbg) + printf("camera org %f %f %f\n", + (float)origin.v.x, + (float)origin.v.y, + (float)origin.v.z); + ray.direction = normalize(direction.v+pixel.x*direction.du+pixel.y*direction.dv); ray.tMin = 0.; diff --git a/miniapp/DGEF.cpp b/miniapp/DGEF.cpp new file mode 100644 index 0000000..249bb55 --- /dev/null +++ b/miniapp/DGEF.cpp @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "DGEF.h" +#include + +namespace miniapp { + namespace dgef { + + Scene *Scene::load(const std::string &fileName) + { + std::ifstream in(fileName.c_str(),std::ios::binary); + + std::vector meshes; + Scene *scene = new Scene; + + size_t header; + in.read((char*)&header,sizeof(header)); + + bool isProperDGEF = (header == 0xdefdefdefULL); + size_t numMeshes; + in.read((char*)&numMeshes,sizeof(numMeshes)); + for (int meshID=0;meshIDmeshes = meshes; + Instance *inst = new Instance; + inst->xfm = affine3d(); + inst->object = object; + scene->instances.push_back(inst); + return scene; + } + + size_t numObjects; + in.read((char*)&numObjects,sizeof(numObjects)); + int meshBegin = 0; + std::vector objects; + for (int i=0;imeshes.push_back(meshes[meshBegin++]); + objects.push_back(object); + } + + size_t numInstances; + in.read((char*)&numInstances,sizeof(numInstances)); + for (int instID=0;instIDxfm,sizeof(inst->xfm)); + int objectID; + in.read((char*)&objectID,sizeof(objectID)); + inst->object = objects[objectID]; + scene->instances.push_back(inst); + } + return scene; + } + + box3d Scene::bounds() const + { + std::cout << "#dpm: computing scene bounds" << std::endl; + box3d bounds; + for (auto inst : instances) + for (auto m : inst->object->meshes) + for (auto v : m->vertices) + bounds.extend(xfmPoint(inst->xfm,v)); + std::cout << "#dpm: bounds = " << bounds << std::endl; + return bounds; + } + + } +} + diff --git a/miniapp/DGEF.h b/miniapp/DGEF.h new file mode 100644 index 0000000..3e741d1 --- /dev/null +++ b/miniapp/DGEF.h @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "deepeeRT/deepeeRT.h" +#include "cuBQL/math/box.h" +#include "cuBQL/math/linear.h" +#include "cuBQL/math/affine.h" +#include + +#include +#include +#include +#include +#include + +namespace miniapp { + using namespace cuBQL; + namespace dgef { + struct Mesh { + std::vector vertices; + std::vector indices; + }; + struct Object { + std::vector meshes; + }; + struct Instance { + affine3d xfm; + Object *object; + }; + struct Scene { + static Scene *load(const std::string &fileName); + std::vector instances; + box3d bounds() const; + }; + } +} diff --git a/miniapp/Mesh.cpp b/miniapp/Mesh.cpp index d7ac0b4..1161e8f 100644 --- a/miniapp/Mesh.cpp +++ b/miniapp/Mesh.cpp @@ -4,9 +4,11 @@ #include "Mesh.h" #include #include +#include namespace miniapp { - + using cuBQL::affine3d; + box3d Mesh::bounds() { box3d bb; @@ -55,6 +57,8 @@ namespace miniapp { load_obj(fileName); else if (ext == ".binmesh") load_binmesh(fileName); + else if (ext == ".dgef") + load_dgef(fileName); else throw std::runtime_error("un-recognized or un-supported file extension '"+ext+"'"); } @@ -80,6 +84,52 @@ namespace miniapp { in.read((char*)indices.data(),numIndices*sizeof(indices[0])); } + + void Mesh::load_dgef(const std::string &fileName) + { + vertices.clear(); + indices.clear(); + + std::ifstream in(fileName.c_str(),std::ios::binary); + + size_t header; + in.read((char*)&header,sizeof(header)); + + size_t numMeshes; + in.read((char*)&numMeshes,sizeof(numMeshes)); + std::vector meshes(numMeshes); + for (int meshID=0;meshIDvertices = meshes[0].vertices; + this->indices = meshes[0].indices; + } + + void Mesh::translate(vec3d delta) { for (auto &v : vertices) diff --git a/miniapp/Mesh.h b/miniapp/Mesh.h index 1010c17..33cb92d 100644 --- a/miniapp/Mesh.h +++ b/miniapp/Mesh.h @@ -1,9 +1,10 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA +// CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "deepee/deepee.h" +#include "deepeeRT/deepeeRT.h" #include "cuBQL/math/box.h" #include "cuBQL/math/linear.h" #include @@ -29,6 +30,7 @@ namespace miniapp { private: void load_binmesh(const std::string &fileName); void load_obj(const std::string &fileName); + void load_dgef(const std::string &fileName); }; /*! helper function that creates a mesh with a terrain-like shape, diff --git a/miniapp/instanceMiniApp.cu b/miniapp/instanceMiniApp.cu new file mode 100644 index 0000000..786249d --- /dev/null +++ b/miniapp/instanceMiniApp.cu @@ -0,0 +1,250 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "Camera.h" +#include "DGEF.h" +#include + +namespace miniapp { + + /*! helper function that creates a semi-random color from an ID */ + inline __cubql_both vec3f randomColor(int i) + { + const uint64_t FNV_offset_basis = 0xcbf29ce484222325ULL; + const uint64_t FNV_prime = 0x10001a7; + uint32_t v = (uint32_t)FNV_offset_basis; + v = FNV_prime * v ^ i; + v = FNV_prime * v ^ i; + v = FNV_prime * v ^ i; + v = FNV_prime * v ^ i; + + int r = v >> 24; + v = FNV_prime * v ^ i; + int b = v >> 16; + v = FNV_prime * v ^ i; + int g = v >> 8; + return vec3f((r&255)/255.f, + (g&255)/255.f, + (b&255)/255.f); + } + + void getFrame(std::string up, + vec3d &dx, + vec3d &dy, + vec3d &dz) + { + if (up == "z") { + dx = {1.,0.,0.}; + dy = {0.,1.,0.}; + dz = {0.,0.,1.}; + return; + } + if (up == "y") { + dx = {1.,0.,0.}; + dz = {0.,1.,0.}; + dy = {0.,0.,1.}; + return; + } + throw std::runtime_error("unhandled 'up'-specifier of '"+up+"'"); + } + + DPRWorld createWorld(DPRContext context, + dgef::Scene *scene) + { + std::map objects; + CUBQL_CUDA_SYNC_CHECK(); + + for (auto inst : scene->instances) + objects[inst->object] = 0; + + std::cout << "#dpm: creating " << objects.size() << " objects" << std::endl; + int meshID = 0; + for (auto &pairs : objects) { + auto obj = pairs.first; + std::vector geoms; + for (auto pm : obj->meshes) { + std::cout << "#dpm: creating dpr triangle mesh w/ " + << prettyNumber(pm->indices.size()) << " triangles" + << std::endl; + DPRTriangles geom + = dprCreateTrianglesDP(context, + meshID++, + (DPRvec3*)pm->vertices.data(), + pm->vertices.size(), + (DPRint3*)pm->indices.data(), + pm->indices.size()); + CUBQL_CUDA_SYNC_CHECK(); + geoms.push_back(geom); + } + CUBQL_CUDA_SYNC_CHECK(); + + DPRGroup group = dprCreateTrianglesGroup(context, + geoms.data(), + geoms.size()); + objects[obj] = group; + } + CUBQL_CUDA_SYNC_CHECK(); + + std::cout << "#dpm: creating dpr world" << std::endl; + std::vector xfms; + std::vector groups; + for (auto inst : scene->instances) { + xfms.push_back(inst->xfm); + groups.push_back(objects[inst->object]); + } + DPRWorld world = dprCreateWorldDP(context, + groups.data(), + (DPRAffine*)xfms.data(), + groups.size()); + CUBQL_CUDA_SYNC_CHECK(); + return world; + } + + + __global__ + void g_shadeRays(vec4f *d_pixels, + DPRRay *d_rays, + DPRHit *d_hits, + vec2i fbSize) + { + int ix = threadIdx.x+blockIdx.x*blockDim.x; + int iy = threadIdx.y+blockIdx.y*blockDim.y; + + if (ix >= fbSize.x) return; + if (iy >= fbSize.y) return; + + //Ray ray = (const Ray &)d_rays[ix+iy*fbSize.x]; + DPRHit hit = d_hits[ix+iy*fbSize.x]; + vec3f color = randomColor(hit.primID + 0x290374*hit.geomUserData); + vec4f pixel = {color.x,color.y,color.z,1.f}; + int tid = ix+iy*fbSize.x; + d_pixels[tid] = pixel; + } + + __global__ + void g_generateRays(DPRRay *d_rays, + vec2i fbSize, + const Camera camera) + { + static_assert(sizeof(DPRRay) == sizeof(Ray)); + + int ix = threadIdx.x+blockIdx.x*blockDim.x; + int iy = threadIdx.y+blockIdx.y*blockDim.y; + + if (ix >= fbSize.x) return; + if (iy >= fbSize.y) return; + + double u = ix+.5; + double v = iy+.5; + + bool dbg = false;//ix == 512 && iy == 512; + vec2d pixel = {u,v}; + Ray ray = camera.generateRay(pixel,dbg); + + int rayID = ix+iy*fbSize.x; + if (dbg) + printf("ray %f %f %f : %f %f %f\n", + (float)ray.origin.x, + (float)ray.origin.y, + (float)ray.origin.z, + (float)ray.direction.x, + (float)ray.direction.y, + (float)ray.direction.z); + ((Ray *)d_rays)[rayID] = ray; + } + + void main(int ac, char **av) + { + std::string inFileName; + std::string outFileName = "deepeeTest.ppm"; + vec2i fbSize = { 1024,1024 }; + uint64_t flags = 0; + for (int i=1;ibounds(); + Camera camera = generateCamera(fbSize, + /* bounds to focus on */ + bounds, + /* point we're looking from*/ + length(bounds.size())*vec3d(2,1,4), + /* up for orientation */ + vec3d(0,1,0)); + + vec2i bs(16,16); + vec2i nb = divRoundUp(fbSize,bs); + + std::cout << "#dpm: creating dpr context" << std::endl; + DPRContext dpr = dprContextCreate(DPR_CONTEXT_GPU,0); + std::cout << "#dpm: creating world" << std::endl; + DPRWorld world = createWorld(dpr,scene); + + CUBQL_CUDA_SYNC_CHECK(); + DPRRay *d_rays = 0; + cudaMalloc((void **)&d_rays,fbSize.x*fbSize.y*sizeof(DPRRay)); + CUBQL_CUDA_SYNC_CHECK(); + g_generateRays<<>>(d_rays,fbSize,camera); + CUBQL_CUDA_SYNC_CHECK(); + + DPRHit *d_hits = 0; + cudaMalloc((void **)&d_hits,fbSize.x*fbSize.y*sizeof(DPRHit)); + + CUBQL_CUDA_SYNC_CHECK(); + std::cout << "#dpm: calling trace" << std::endl; + dprTrace(world,d_rays,d_hits,fbSize.x*fbSize.y,flags); + + std::cout << "#dpm: shading rays" << std::endl; + vec4f *m_pixels = 0; + cudaMallocManaged((void **)&m_pixels,fbSize.x*fbSize.y*sizeof(vec4f)); + g_shadeRays<<>>(m_pixels,d_rays,d_hits,fbSize); + cudaStreamSynchronize(0); + + + std::cout << "#dpm: writing test image to " << outFileName << std::endl; + std::ofstream out(outFileName.c_str()); + + char buf[100]; + sprintf(buf,"P3\n#deepee test image\n%i %i 255\n",fbSize.x,fbSize.y); + out << "P3\n"; + out << "#deepeeRT test image\n"; + out << fbSize.x << " " << fbSize.y << " 255" << std::endl; + for (int iy=0;iy &meshes) { + CUBQL_CUDA_SYNC_CHECK(); std::vector geoms; int meshID = 0; for (auto pm : meshes) { @@ -66,18 +67,23 @@ namespace miniapp { pm->vertices.size(), (DPRint3*)d_indices, pm->indices.size()); + CUBQL_CUDA_SYNC_CHECK(); geoms.push_back(geom); } + CUBQL_CUDA_SYNC_CHECK(); std::cout << "#dpm: creating dpr triangles group w/ " << geoms.size() << " meshes" << std::endl; DPRGroup group = dprCreateTrianglesGroup(context, geoms.data(), geoms.size()); + CUBQL_CUDA_SYNC_CHECK(); + std::cout << "#dpm: creating dpr world" << std::endl; DPRWorld world = dprCreateWorldDP(context, &group, nullptr, 1); + CUBQL_CUDA_SYNC_CHECK(); return world; } @@ -118,16 +124,25 @@ namespace miniapp { double u = ix+.5; double v = iy+.5; + bool dbg = false;//ix == 512 && iy == 512; vec2d pixel = {u,v}; - Ray ray = camera.generateRay(pixel,false); + Ray ray = camera.generateRay(pixel,dbg); int rayID = ix+iy*fbSize.x; + if (dbg) + printf("ray %f %f %f : %f %f %f\n", + (float)ray.origin.x, + (float)ray.origin.y, + (float)ray.origin.z, + (float)ray.direction.x, + (float)ray.direction.y, + (float)ray.direction.z); ((Ray *)d_rays)[rayID] = ray; } void main(int ac, char **av) { - double scale = 1; + double scale = 3; std::string up = "y"; std::string inFileName; std::string outFileName = "deepeeTest.ppm"; @@ -167,7 +182,7 @@ namespace miniapp { /* bounds to focus on */ object.bounds(), /* point we're looking from*/ - -1.*scale*(dx+dy)+.5*scale*dz, + -2.*scale*(dx+dy)+scale*dz, /* up for orientation */ dz); diff --git a/submodules/cuBQL b/submodules/cuBQL index d18c5fa..9208c7e 160000 --- a/submodules/cuBQL +++ b/submodules/cuBQL @@ -1 +1 @@ -Subproject commit d18c5fa1a5c98665d13484841ae65774da7751e8 +Subproject commit 9208c7e5055f9c0669eb428e9ad97229ca2b5674