diff --git a/include/rtkCudaDisplacedDetectorImageFilter.hcu b/include/rtkCudaDisplacedDetectorImageFilter.hcu
index b115c9ac1..8c50a12ed 100644
--- a/include/rtkCudaDisplacedDetectorImageFilter.hcu
+++ b/include/rtkCudaDisplacedDetectorImageFilter.hcu
@@ -28,7 +28,7 @@ CUDA_displaced_weight(int     proj_idx_in[3],
                       int     proj_dim_out_buf[2],
                       float * dev_proj_in,
                       float * dev_proj_out,
-                      float * geometries,
+                      float * geometry,
                       float   theta,
                       bool    isPositiveCase,
                       float   proj_orig,
diff --git a/include/rtkCudaFDKWeightProjectionFilter.hcu b/include/rtkCudaFDKWeightProjectionFilter.hcu
index f6284a255..353230ce1 100644
--- a/include/rtkCudaFDKWeightProjectionFilter.hcu
+++ b/include/rtkCudaFDKWeightProjectionFilter.hcu
@@ -26,7 +26,7 @@ CUDA_weight_projection(int     proj_idx[2],
                        int     proj_dim_buf_out[2],
                        float * dev_proj_in,
                        float * dev_proj_out,
-                       float * geometries,
+                       float * geometry,
                        float   proj_orig[2],
                        float   proj_row[2],
                        float   proj_col[2]);
diff --git a/include/rtkCudaParkerShortScanImageFilter.hcu b/include/rtkCudaParkerShortScanImageFilter.hcu
index e39e8783f..aca4c24d1 100644
--- a/include/rtkCudaParkerShortScanImageFilter.hcu
+++ b/include/rtkCudaParkerShortScanImageFilter.hcu
@@ -26,7 +26,7 @@ CUDA_parker_weight(int     proj_idx[2],
                    int     proj_dim_buf_out[2],
                    float * dev_proj_in,
                    float * dev_proj_out,
-                   float * geometries,
+                   float * geometry,
                    float   delta,
                    float   firstAngle,
                    float   proj_orig,
diff --git a/include/rtkCudaUtilities.hcu b/include/rtkCudaUtilities.hcu
index 612a8fda6..737da666e 100644
--- a/include/rtkCudaUtilities.hcu
+++ b/include/rtkCudaUtilities.hcu
@@ -127,14 +127,30 @@ inline __host__ __device__ float
   return u.x * v.x + u.y * v.y + u.z * v.z;
 }
 
+__host__ void
+prepareScalarTextureObject(int                          size[3],
+                           float *                      dev_ptr,
+                           cudaArray *&                 threeDArray,
+                           cudaTextureObject_t &        tex,
+                           const bool                   isProjections,
+                           const bool                   isLinear = true,
+                           const cudaTextureAddressMode texAddressMode = cudaAddressModeBorder);
+
 __host__ void
 prepareVectorTextureObject(int                                size[3],
                            const float *                      dev_ptr,
                            std::vector<cudaArray *> &         componentArrays,
                            const unsigned int                 nComponents,
                            std::vector<cudaTextureObject_t> & tex,
-                           bool                               isProjections);
+                           const bool                         isProjections,
+                           const cudaTextureAddressMode       texAddressMode = cudaAddressModeBorder);
 
+__host__ void
+prepareGeometryTextureObject(int                   size,
+                             const float *         geometry,
+                             float *&              dev_geom,
+                             cudaTextureObject_t & tex_geom,
+                             const unsigned int    nParam);
 
 inline __device__ void
 matrix_matrix_multiply(float * A, float * B, float * C, unsigned int rowsA, unsigned int colsB, unsigned int colsArowsB)
diff --git a/src/rtkCudaDisplacedDetectorImageFilter.cu b/src/rtkCudaDisplacedDetectorImageFilter.cu
index 0d579c79c..b888487b3 100644
--- a/src/rtkCudaDisplacedDetectorImageFilter.cu
+++ b/src/rtkCudaDisplacedDetectorImageFilter.cu
@@ -22,8 +22,6 @@
 #include <cuda_runtime.h>
 #include <math_constants.h>
 
-texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture
-
 inline __device__ float
 TransformIndexToPhysicalPoint(int2 idx, float origin, float row, float column)
 {
@@ -45,19 +43,20 @@ ToUntiltedCoordinateAtIsocenter(float tiltedCoord, float sdd, float sid, float s
 }
 
 __global__ void
-kernel_displaced_weight(int3    proj_idx_in,
-                        int3    proj_size_in,
-                        int2    proj_size_in_buf,
-                        int3    proj_idx_out,
-                        int3    proj_size_out,
-                        int2    proj_size_out_buf,
-                        float * dev_proj_in,
-                        float * dev_proj_out,
-                        float   theta,
-                        bool    isPositiveCase,
-                        float   proj_orig, // projection origin
-                        float   proj_row,  // projection row direction & spacing
-                        float   proj_col   // projection col direction & spacing
+kernel_displaced_weight(int3                proj_idx_in,
+                        int3                proj_size_in,
+                        int2                proj_size_in_buf,
+                        int3                proj_idx_out,
+                        int3                proj_size_out,
+                        int2                proj_size_out_buf,
+                        float *             dev_proj_in,
+                        float *             dev_proj_out,
+                        float               theta,
+                        bool                isPositiveCase,
+                        float               proj_orig, // projection origin
+                        float               proj_row,  // projection row direction & spacing
+                        float               proj_col,  // projection col direction & spacing
+                        cudaTextureObject_t tex_geom   // geometry texture object
 )
 {
   // compute thread index
@@ -90,10 +89,10 @@ kernel_displaced_weight(int3    proj_idx_in,
   {
     float pPoint = TransformIndexToPhysicalPoint(make_int2(pIdx.x, pIdx.y), proj_orig, proj_row, proj_col);
 
-    float sdd = tex1Dfetch(tex_geometry, tIdx.z * 4 + 0);
-    float sx = tex1Dfetch(tex_geometry, tIdx.z * 4 + 1);
-    float px = tex1Dfetch(tex_geometry, tIdx.z * 4 + 2);
-    float sid = tex1Dfetch(tex_geometry, tIdx.z * 4 + 3);
+    float sdd = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 0);
+    float sx = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 1);
+    float px = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 2);
+    float sid = tex1Dfetch<float>(tex_geom, tIdx.z * 4 + 3);
 
     float hyp = sqrtf(sid * sid + sx * sx); // to untilted situation
     float l = ToUntiltedCoordinateAtIsocenter(pPoint, sdd, sid, sx, px, hyp);
@@ -139,7 +138,7 @@ CUDA_displaced_weight(int     proj_idx_in[3],      // overlapping input region i
                       int     proj_dim_out_buf[2], // output size of buffered region
                       float * dev_proj_in,
                       float * dev_proj_out,
-                      float * geometries,
+                      float * geometry,
                       float   theta,
                       bool    isPositiveCase,
                       float   proj_orig,
@@ -147,13 +146,9 @@ CUDA_displaced_weight(int     proj_idx_in[3],      // overlapping input region i
                       float   proj_col)
 {
   // copy geometry matrix to device, bind the matrix to the texture
-  float * dev_geom;
-  cudaMalloc((void **)&dev_geom, proj_dim_out[2] * 4 * sizeof(float));
-  CUDA_CHECK_ERROR;
-  cudaMemcpy(dev_geom, geometries, proj_dim_out[2] * 4 * sizeof(float), cudaMemcpyHostToDevice);
-  CUDA_CHECK_ERROR;
-  cudaBindTexture(0, tex_geometry, dev_geom, proj_dim_out[2] * 4 * sizeof(float));
-  CUDA_CHECK_ERROR;
+  float *             dev_geom;
+  cudaTextureObject_t tex_geom;
+  prepareGeometryTextureObject(proj_dim_out[2], geometry, dev_geom, tex_geom, 4);
 
   // Thread Block Dimensions
   int tBlock_x = 16;
@@ -179,10 +174,11 @@ CUDA_displaced_weight(int     proj_idx_in[3],      // overlapping input region i
                                                  isPositiveCase,
                                                  proj_orig,
                                                  proj_row,
-                                                 proj_col);
+                                                 proj_col,
+                                                 tex_geom);
 
   // Unbind matrix texture
-  cudaUnbindTexture(tex_geometry);
+  cudaDestroyTextureObject(tex_geom);
   CUDA_CHECK_ERROR;
   cudaFree(dev_geom);
   CUDA_CHECK_ERROR;
diff --git a/src/rtkCudaFDKBackProjectionImageFilter.cu b/src/rtkCudaFDKBackProjectionImageFilter.cu
index 4a9d45601..417e7544f 100644
--- a/src/rtkCudaFDKBackProjectionImageFilter.cu
+++ b/src/rtkCudaFDKBackProjectionImageFilter.cu
@@ -39,9 +39,6 @@
  *****************/
 #include <cuda.h>
 
-// T E X T U R E S ////////////////////////////////////////////////////////
-texture<float, cudaTextureType2DLayered> tex_proj;
-
 // Constant memory
 __constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections
 __constant__ int3 c_projSize;
@@ -53,7 +50,7 @@ __constant__ int3 c_vol_size;
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 
 __global__ void
-kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out)
+kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out, cudaTextureObject_t tex_proj)
 {
   itk::SizeValueType i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
   itk::SizeValueType j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
@@ -81,7 +78,7 @@ kernel_fdk_3Dgrid(float * dev_vol_in, float * dev_vol_out)
     ip.y = ip.y * ip.z;
 
     // Get texture point, clip left to GPU, and accumulate in voxel_data
-    voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj) * ip.z * ip.z;
+    voxel_data += tex2DLayered<float>(tex_proj, ip.x, ip.y, proj) * ip.z * ip.z;
   }
 
   // Place it into the volume
@@ -110,33 +107,6 @@ CUDA_reconstruct_conebeam(int     proj_size[3],
   // Copy the projection matrices into constant memory
   cudaMemcpyToSymbol(c_matrices, &(matrices[0]), 12 * sizeof(float) * proj_size[2]);
 
-  // set texture parameters
-  tex_proj.addressMode[0] = cudaAddressModeBorder;
-  tex_proj.addressMode[1] = cudaAddressModeBorder;
-  tex_proj.addressMode[2] = cudaAddressModeBorder;
-  tex_proj.filterMode = cudaFilterModeLinear;
-  tex_proj.normalized = false; // don't access with normalized texture coords
-
-  // Copy projection data to array, bind the array to the texture
-  cudaExtent                   projExtent = make_cudaExtent(proj_size[0], proj_size[1], proj_size[2]);
-  cudaArray *                  array_proj;
-  static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-  CUDA_CHECK_ERROR;
-
-  // Allocate array for input projections, in order to bind them to
-  // a 2D layered texture
-  cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
-  CUDA_CHECK_ERROR;
-
-  // Copy data to 3D array
-  cudaMemcpy3DParms copyParams = cudaMemcpy3DParms();
-  copyParams.srcPtr = make_cudaPitchedPtr(dev_proj, proj_size[0] * sizeof(float), proj_size[0], proj_size[1]);
-  copyParams.dstArray = (cudaArray *)array_proj;
-  copyParams.extent = projExtent;
-  copyParams.kind = cudaMemcpyDeviceToDevice;
-  cudaMemcpy3D(&copyParams);
-  CUDA_CHECK_ERROR;
-
   // Thread Block Dimensions
   constexpr int tBlock_x = 16;
   constexpr int tBlock_y = 4;
@@ -153,19 +123,15 @@ CUDA_reconstruct_conebeam(int     proj_size[3],
   // Compute block and grid sizes
   dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
   dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
-  CUDA_CHECK_ERROR;
-
-  // Bind the array of projections to a 2D layered texture
-  cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
-  CUDA_CHECK_ERROR;
 
-  kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);
-
-  // Unbind the image and projection matrix textures
-  cudaUnbindTexture(tex_proj);
-  CUDA_CHECK_ERROR;
+  cudaArray *         array_proj;
+  cudaTextureObject_t tex_proj;
+  prepareScalarTextureObject(proj_size, dev_proj, array_proj, tex_proj, true);
+  kernel_fdk_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out, tex_proj);
 
   // Cleanup
   cudaFreeArray((cudaArray *)array_proj);
   CUDA_CHECK_ERROR;
+  cudaDestroyTextureObject(tex_proj);
+  CUDA_CHECK_ERROR;
 }
diff --git a/src/rtkCudaFDKWeightProjectionFilter.cu b/src/rtkCudaFDKWeightProjectionFilter.cu
index e4f4a45be..26e2e3b38 100644
--- a/src/rtkCudaFDKWeightProjectionFilter.cu
+++ b/src/rtkCudaFDKWeightProjectionFilter.cu
@@ -21,8 +21,6 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture
-
 inline __device__ float2
                   TransformIndexToPhysicalPoint(int2 idx, float2 origin, float2 row, float2 column)
 {
@@ -30,15 +28,16 @@ inline __device__ float2
 }
 
 __global__ void
-kernel_weight_projection(int2    proj_idx,
-                         int3    proj_size,
-                         int2    proj_size_buf_in,
-                         int2    proj_size_buf_out,
-                         float * dev_proj_in,
-                         float * dev_proj_out,
-                         float2  proj_orig, // projection origin
-                         float2  proj_row,  // projection row direction & spacing
-                         float2  proj_col   // projection col direction & spacing
+kernel_weight_projection(int2                proj_idx,
+                         int3                proj_size,
+                         int2                proj_size_buf_in,
+                         int2                proj_size_buf_out,
+                         float *             dev_proj_in,
+                         float *             dev_proj_out,
+                         float2              proj_orig, // projection origin
+                         float2              proj_row,  // projection row direction & spacing
+                         float2              proj_col,  // projection col direction & spacing
+                         cudaTextureObject_t tex_geom   // geometry texture object
 )
 {
   // compute projection index (== thread index)
@@ -53,19 +52,19 @@ kernel_weight_projection(int2    proj_idx,
   if (pIdx.x >= proj_size.x || pIdx.y >= proj_size.y || pIdx.z >= proj_size.z)
     return;
 
-  const float sdd = tex1Dfetch(tex_geometry, pIdx.z * 7 + 0);
-  const float sid = tex1Dfetch(tex_geometry, pIdx.z * 7 + 1);
-  const float wFac = tex1Dfetch(tex_geometry, pIdx.z * 7 + 5);
+  const float sdd = tex1Dfetch<float>(tex_geom, pIdx.z * 7);
+  const float sid = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 1);
+  const float wFac = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 5);
   if (sdd == 0) // parallel
   {
     dev_proj_out[pIdx_comp_out] = dev_proj_in[pIdx_comp_in] * wFac;
   }
   else // divergent
   {
-    const float pOffX = tex1Dfetch(tex_geometry, pIdx.z * 7 + 2);
-    const float pOffY = tex1Dfetch(tex_geometry, pIdx.z * 7 + 3);
-    const float sOffY = tex1Dfetch(tex_geometry, pIdx.z * 7 + 4);
-    const float tAngle = tex1Dfetch(tex_geometry, pIdx.z * 7 + 6);
+    const float pOffX = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 2);
+    const float pOffY = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 3);
+    const float sOffY = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 4);
+    const float tAngle = tex1Dfetch<float>(tex_geom, pIdx.z * 7 + 6);
     const float sina = sin(tAngle);
     const float cosa = cos(tAngle);
     const float tana = tan(tAngle);
@@ -90,19 +89,14 @@ CUDA_weight_projection(int     proj_idx[2],
                        int     proj_dim_buf_out[2],
                        float * dev_proj_in,
                        float * dev_proj_out,
-                       float * geometries,
+                       float * geometry,
                        float   proj_orig[2],
                        float   proj_row[2],
                        float   proj_col[2])
 {
-  // copy geometry matrix to device, bind the matrix to the texture
-  float * dev_geom;
-  cudaMalloc((void **)&dev_geom, proj_dim[2] * 7 * sizeof(float));
-  CUDA_CHECK_ERROR;
-  cudaMemcpy(dev_geom, geometries, proj_dim[2] * 7 * sizeof(float), cudaMemcpyHostToDevice);
-  CUDA_CHECK_ERROR;
-  cudaBindTexture(0, tex_geometry, dev_geom, proj_dim[2] * 7 * sizeof(float));
-  CUDA_CHECK_ERROR;
+  float *             dev_geom;
+  cudaTextureObject_t tex_geom;
+  prepareGeometryTextureObject(proj_dim[2], geometry, dev_geom, tex_geom, 7);
 
   // Thread Block Dimensions
   int tBlock_x = 16;
@@ -124,10 +118,11 @@ CUDA_weight_projection(int     proj_idx[2],
                                                   dev_proj_out,
                                                   make_float2(proj_orig[0], proj_orig[1]),
                                                   make_float2(proj_row[0], proj_row[1]),
-                                                  make_float2(proj_col[0], proj_col[1]));
+                                                  make_float2(proj_col[0], proj_col[1]),
+                                                  tex_geom);
 
-  // Unbind matrix texture
-  cudaUnbindTexture(tex_geometry);
+  // destroy texture object
+  cudaDestroyTextureObject(tex_geom);
   CUDA_CHECK_ERROR;
   cudaFree(dev_geom);
   CUDA_CHECK_ERROR;
diff --git a/src/rtkCudaForwardProjectionImageFilter.cu b/src/rtkCudaForwardProjectionImageFilter.cu
index 3940d0f0d..869d263f9 100644
--- a/src/rtkCudaForwardProjectionImageFilter.cu
+++ b/src/rtkCudaForwardProjectionImageFilter.cu
@@ -36,7 +36,6 @@
  * CUDA #includes *
  *****************/
 #include <cuda.h>
-#include <cublas_v2.h>
 #include <cuda_runtime.h>
 
 // CONSTANTS //
@@ -201,7 +200,7 @@ CUDA_forward_project(int          projSize[3],
   // Prepare texture objects
   std::vector<cudaArray *>         volComponentArrays;
   std::vector<cudaTextureObject_t> tex_vol;
-  prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false);
+  prepareVectorTextureObject(volSize, dev_vol, volComponentArrays, vectorLength, tex_vol, false, cudaAddressModeClamp);
 
   // Copy them to a device pointer, since it will have to be de-referenced in the kernels
   cudaTextureObject_t * dev_tex_vol;
diff --git a/src/rtkCudaForwardWarpImageFilter.cu b/src/rtkCudaForwardWarpImageFilter.cu
index 1f9a04956..530dc33b6 100644
--- a/src/rtkCudaForwardWarpImageFilter.cu
+++ b/src/rtkCudaForwardWarpImageFilter.cu
@@ -39,15 +39,11 @@
  *****************/
 #include <cuda.h>
 
-// T E X T U R E S ////////////////////////////////////////////////////////
-texture<float, 1, cudaReadModeElementType> tex_IndexInputToPPInputMatrix;
-texture<float, 1, cudaReadModeElementType> tex_IndexInputToIndexDVFMatrix;
-texture<float, 1, cudaReadModeElementType> tex_PPOutputToIndexOutputMatrix;
-
-texture<float, 3, cudaReadModeElementType> tex_xdvf;
-texture<float, 3, cudaReadModeElementType> tex_ydvf;
-texture<float, 3, cudaReadModeElementType> tex_zdvf;
-///////////////////////////////////////////////////////////////////////////
+// CONSTANTS //////////////////////////////////////////////////////////////
+__constant__ float c_IndexInputToPPInputMatrix[12];
+__constant__ float c_IndexInputToIndexDVFMatrix[12];
+__constant__ float c_PPOutputToIndexOutputMatrix[12];
+////////////////////////////////////////////////////////////////////////////
 
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 // K E R N E L S -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
@@ -129,7 +125,14 @@ normalize_3Dgrid(float * dev_vol_out, float * dev_accumulate_weights, int3 out_d
 }
 
 __global__ void
-linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumulate_weights, int3 in_dim, int3 out_dim)
+linearSplat_3Dgrid(float *             dev_vol_in,
+                   float *             dev_vol_out,
+                   float *             dev_accumulate_weights,
+                   int3                in_dim,
+                   int3                out_dim,
+                   cudaTextureObject_t tex_xdvf,
+                   cudaTextureObject_t tex_ydvf,
+                   cudaTextureObject_t tex_zdvf)
 {
   unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
   unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
@@ -141,32 +144,21 @@ linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumula
   }
 
   // Index row major into the volume
+  float3   idx = make_float3(i, j, k);
   long int in_idx = i + (j + k * in_dim.y) * (in_dim.x);
 
   // Matrix multiply to get the index in the DVF texture of the current point in the output volume
-  float3 IndexInDVF;
-  IndexInDVF.x = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 0) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 1) * j +
-                 tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 2) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 3);
-  IndexInDVF.y = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 4) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 5) * j +
-                 tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 6) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 7);
-  IndexInDVF.z = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 8) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 9) * j +
-                 tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 10) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 11);
+  float3 IndexInDVF = matrix_multiply(idx, c_IndexInputToIndexDVFMatrix);
 
   // Get each component of the displacement vector by
   // interpolation in the dvf
   float3 Displacement;
-  Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-  Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-  Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+  Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+  Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+  Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
 
   // Matrix multiply to get the physical coordinates of the current point in the input volume
-  float3 PPinInput;
-  PPinInput.x = tex1Dfetch(tex_IndexInputToPPInputMatrix, 0) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 1) * j +
-                tex1Dfetch(tex_IndexInputToPPInputMatrix, 2) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 3);
-  PPinInput.y = tex1Dfetch(tex_IndexInputToPPInputMatrix, 4) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 5) * j +
-                tex1Dfetch(tex_IndexInputToPPInputMatrix, 6) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 7);
-  PPinInput.z = tex1Dfetch(tex_IndexInputToPPInputMatrix, 8) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 9) * j +
-                tex1Dfetch(tex_IndexInputToPPInputMatrix, 10) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 11);
+  float3 PPinInput = matrix_multiply(idx, c_IndexInputToPPInputMatrix);
 
   // Get the index corresponding to the current physical point in output displaced by the displacement vector
   float3 PPDisplaced;
@@ -174,19 +166,7 @@ linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumula
   PPDisplaced.y = PPinInput.y + Displacement.y;
   PPDisplaced.z = PPinInput.z + Displacement.z;
 
-  float3 IndexInOutput;
-  IndexInOutput.x = tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 0) * PPDisplaced.x +
-                    tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 1) * PPDisplaced.y +
-                    tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 2) * PPDisplaced.z +
-                    tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 3);
-  IndexInOutput.y = tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 4) * PPDisplaced.x +
-                    tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 5) * PPDisplaced.y +
-                    tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 6) * PPDisplaced.z +
-                    tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 7);
-  IndexInOutput.z = tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 8) * PPDisplaced.x +
-                    tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 9) * PPDisplaced.y +
-                    tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 10) * PPDisplaced.z +
-                    tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 11);
+  float3 IndexInOutput = matrix_multiply(PPDisplaced, c_PPOutputToIndexOutputMatrix);
 
   // Compute the splat weights
   int3 BaseIndexInOutput;
@@ -303,11 +283,14 @@ linearSplat_3Dgrid(float * dev_vol_in, float * dev_vol_out, float * dev_accumula
 }
 
 __global__ void
-nearestNeighborSplat_3Dgrid(float * dev_vol_in,
-                            float * dev_vol_out,
-                            float * dev_accumulate_weights,
-                            int3    in_dim,
-                            int3    out_dim)
+nearestNeighborSplat_3Dgrid(float *             dev_vol_in,
+                            float *             dev_vol_out,
+                            float *             dev_accumulate_weights,
+                            int3                in_dim,
+                            int3                out_dim,
+                            cudaTextureObject_t tex_xdvf,
+                            cudaTextureObject_t tex_ydvf,
+                            cudaTextureObject_t tex_zdvf)
 {
   unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
   unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
@@ -322,29 +305,18 @@ nearestNeighborSplat_3Dgrid(float * dev_vol_in,
   long int in_idx = i + (j + k * in_dim.y) * (in_dim.x);
 
   // Matrix multiply to get the index in the DVF texture of the current point in the output volume
-  float3 IndexInDVF;
-  IndexInDVF.x = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 0) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 1) * j +
-                 tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 2) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 3);
-  IndexInDVF.y = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 4) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 5) * j +
-                 tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 6) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 7);
-  IndexInDVF.z = tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 8) * i + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 9) * j +
-                 tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 10) * k + tex1Dfetch(tex_IndexInputToIndexDVFMatrix, 11);
+  float3 idx = make_float3(i, j, k);
+  float3 IndexInDVF = matrix_multiply(idx, c_IndexInputToIndexDVFMatrix);
 
   // Get each component of the displacement vector by
   // interpolation in the dvf
   float3 Displacement;
-  Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-  Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-  Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+  Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+  Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+  Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
 
   // Matrix multiply to get the physical coordinates of the current point in the input volume
-  float3 PPinInput;
-  PPinInput.x = tex1Dfetch(tex_IndexInputToPPInputMatrix, 0) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 1) * j +
-                tex1Dfetch(tex_IndexInputToPPInputMatrix, 2) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 3);
-  PPinInput.y = tex1Dfetch(tex_IndexInputToPPInputMatrix, 4) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 5) * j +
-                tex1Dfetch(tex_IndexInputToPPInputMatrix, 6) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 7);
-  PPinInput.z = tex1Dfetch(tex_IndexInputToPPInputMatrix, 8) * i + tex1Dfetch(tex_IndexInputToPPInputMatrix, 9) * j +
-                tex1Dfetch(tex_IndexInputToPPInputMatrix, 10) * k + tex1Dfetch(tex_IndexInputToPPInputMatrix, 11);
+  float3 PPinInput = matrix_multiply(idx, c_IndexInputToPPInputMatrix);
 
   // Get the index corresponding to the current physical point in output displaced by the displacement vector
   float3 PPDisplaced;
@@ -352,19 +324,10 @@ nearestNeighborSplat_3Dgrid(float * dev_vol_in,
   PPDisplaced.y = PPinInput.y + Displacement.y;
   PPDisplaced.z = PPinInput.z + Displacement.z;
 
-  float3 IndexInOutput;
-  IndexInOutput.x = floor(tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 0) * PPDisplaced.x +
-                          tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 1) * PPDisplaced.y +
-                          tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 2) * PPDisplaced.z +
-                          tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 3) + 0.5);
-  IndexInOutput.y = floor(tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 4) * PPDisplaced.x +
-                          tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 5) * PPDisplaced.y +
-                          tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 6) * PPDisplaced.z +
-                          tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 7) + 0.5);
-  IndexInOutput.z = floor(tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 8) * PPDisplaced.x +
-                          tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 9) * PPDisplaced.y +
-                          tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 10) * PPDisplaced.z +
-                          tex1Dfetch(tex_PPOutputToIndexOutputMatrix, 11) + 0.5);
+  float3 IndexInOutput = matrix_multiply(PPDisplaced, c_PPOutputToIndexOutputMatrix);
+  IndexInOutput.x = floor(IndexInOutput.x + 0.5);
+  IndexInOutput.y = floor(IndexInOutput.y + 0.5);
+  IndexInOutput.z = floor(IndexInOutput.z + 0.5);
 
   bool isInVolume = (IndexInOutput.x >= 0) && (IndexInOutput.x < out_dim.x) && (IndexInOutput.y >= 0) &&
                     (IndexInOutput.y < out_dim.y) && (IndexInOutput.z >= 0) && (IndexInOutput.z < out_dim.z);
@@ -400,101 +363,29 @@ CUDA_ForwardWarp(int     input_vol_dim[3],
                  float * dev_output_vol,
                  bool    isLinear)
 {
-
   // Prepare channel description for arrays
   static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 
   ///////////////////////////////////
-  // For each component of the dvf, perform a strided copy (pick every third
-  // float from dev_input_dvf) into a 3D array, and bind the array to a 3D texture
-
   // Extent stuff, will be used for each component extraction
-  cudaExtent dvfExtent = make_cudaExtent(input_dvf_dim[0], input_dvf_dim[1], input_dvf_dim[2]);
-
-  // Set texture parameters
-  tex_xdvf.addressMode[0] = cudaAddressModeBorder;
-  tex_xdvf.addressMode[1] = cudaAddressModeBorder;
-  tex_xdvf.addressMode[2] = cudaAddressModeBorder;
-  tex_xdvf.filterMode = cudaFilterModeLinear;
-  tex_xdvf.normalized = false; // don't access with normalized texture coords
-
-  tex_ydvf.addressMode[0] = cudaAddressModeBorder;
-  tex_ydvf.addressMode[1] = cudaAddressModeBorder;
-  tex_ydvf.addressMode[2] = cudaAddressModeBorder;
-  tex_ydvf.filterMode = cudaFilterModeLinear;
-  tex_ydvf.normalized = false;
-
-  tex_zdvf.addressMode[0] = cudaAddressModeBorder;
-  tex_zdvf.addressMode[1] = cudaAddressModeBorder;
-  tex_zdvf.addressMode[2] = cudaAddressModeBorder;
-  tex_zdvf.filterMode = cudaFilterModeLinear;
-  tex_zdvf.normalized = false;
-
-  // Allocate the arrays
-  cudaArray * array_xdvf;
-  cudaArray * array_ydvf;
-  cudaArray * array_zdvf;
-  cudaMalloc3DArray((cudaArray **)&array_xdvf, &channelDesc, dvfExtent);
-  cudaMalloc3DArray((cudaArray **)&array_ydvf, &channelDesc, dvfExtent);
-  cudaMalloc3DArray((cudaArray **)&array_zdvf, &channelDesc, dvfExtent);
-  CUDA_CHECK_ERROR;
-
-  // Copy image data to arrays. For a nice explanation on make_cudaPitchedPtr, checkout
-  // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api
-  cudaMemcpy3DParms xCopyParams = cudaMemcpy3DParms();
-  xCopyParams.srcPtr =
-    make_cudaPitchedPtr(dev_input_xdvf, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]);
-  xCopyParams.dstArray = (cudaArray *)array_xdvf;
-  xCopyParams.extent = dvfExtent;
-  xCopyParams.kind = cudaMemcpyDeviceToDevice;
-  cudaMemcpy3D(&xCopyParams);
-  CUDA_CHECK_ERROR;
-
-  cudaMemcpy3DParms yCopyParams = cudaMemcpy3DParms();
-  yCopyParams.srcPtr =
-    make_cudaPitchedPtr(dev_input_ydvf, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]);
-  yCopyParams.dstArray = (cudaArray *)array_ydvf;
-  yCopyParams.extent = dvfExtent;
-  yCopyParams.kind = cudaMemcpyDeviceToDevice;
-  cudaMemcpy3D(&yCopyParams);
-  CUDA_CHECK_ERROR;
-
-  cudaMemcpy3DParms zCopyParams = cudaMemcpy3DParms();
-  zCopyParams.srcPtr =
-    make_cudaPitchedPtr(dev_input_zdvf, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]);
-  zCopyParams.dstArray = (cudaArray *)array_zdvf;
-  zCopyParams.extent = dvfExtent;
-  zCopyParams.kind = cudaMemcpyDeviceToDevice;
-  cudaMemcpy3D(&zCopyParams);
-  CUDA_CHECK_ERROR;
-
-  // Bind 3D arrays to 3D textures
-  cudaBindTextureToArray(tex_xdvf, (cudaArray *)array_xdvf, channelDesc);
-  cudaBindTextureToArray(tex_ydvf, (cudaArray *)array_ydvf, channelDesc);
-  cudaBindTextureToArray(tex_zdvf, (cudaArray *)array_zdvf, channelDesc);
-  CUDA_CHECK_ERROR;
+  cudaArray *         array_xdvf, *array_ydvf, *array_zdvf;
+  cudaTextureObject_t tex_xdvf, tex_ydvf, tex_zdvf;
+  prepareScalarTextureObject(input_dvf_dim, dev_input_xdvf, array_xdvf, tex_xdvf, false);
+  prepareScalarTextureObject(input_dvf_dim, dev_input_ydvf, array_ydvf, tex_ydvf, false);
+  prepareScalarTextureObject(input_dvf_dim, dev_input_zdvf, array_zdvf, tex_zdvf, false);
 
   ///////////////////////////////////////
-  // Copy matrices, bind them to textures
-
-  float * dev_IndexInputToPPInput;
-  cudaMalloc((void **)&dev_IndexInputToPPInput, 12 * sizeof(float));
-  cudaMemcpy(dev_IndexInputToPPInput, IndexInputToPPInputMatrix, 12 * sizeof(float), cudaMemcpyHostToDevice);
-  cudaBindTexture(0, tex_IndexInputToPPInputMatrix, dev_IndexInputToPPInput, 12 * sizeof(float));
-
-  float * dev_IndexInputToIndexDVF;
-  cudaMalloc((void **)&dev_IndexInputToIndexDVF, 12 * sizeof(float));
-  cudaMemcpy(dev_IndexInputToIndexDVF, IndexInputToIndexDVFMatrix, 12 * sizeof(float), cudaMemcpyHostToDevice);
-  cudaBindTexture(0, tex_IndexInputToIndexDVFMatrix, dev_IndexInputToIndexDVF, 12 * sizeof(float));
-
-  float * dev_PPOutputToIndexOutput;
-  cudaMalloc((void **)&dev_PPOutputToIndexOutput, 12 * sizeof(float));
-  cudaMemcpy(dev_PPOutputToIndexOutput, PPOutputToIndexOutputMatrix, 12 * sizeof(float), cudaMemcpyHostToDevice);
-  cudaBindTexture(0, tex_PPOutputToIndexOutputMatrix, dev_PPOutputToIndexOutput, 12 * sizeof(float));
+  // Copy matrices into constant memory
+  cudaMemcpyToSymbol(
+    c_IndexInputToPPInputMatrix, IndexInputToPPInputMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice);
+  cudaMemcpyToSymbol(
+    c_IndexInputToIndexDVFMatrix, IndexInputToIndexDVFMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice);
+  cudaMemcpyToSymbol(
+    c_PPOutputToIndexOutputMatrix, PPOutputToIndexOutputMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice);
+  CUDA_CHECK_ERROR;
 
   ///////////////////////////////////////
   /// Initialize the output
-
   cudaMemset((void *)dev_output_vol, 0, sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]);
 
   //////////////////////////////////////
@@ -507,12 +398,6 @@ CUDA_ForwardWarp(int     input_vol_dim[3],
   cudaMemset(
     (void *)dev_accumulate_weights, 0, sizeof(float) * output_vol_dim[0] * output_vol_dim[1] * output_vol_dim[2]);
 
-  //////////////////////////////////////
-  /// Run
-
-  int device;
-  cudaGetDevice(&device);
-
   // Thread Block Dimensions
   constexpr int tBlock_x = 16;
   constexpr int tBlock_y = 4;
@@ -533,14 +418,20 @@ CUDA_ForwardWarp(int     input_vol_dim[3],
                                               dev_output_vol,
                                               dev_accumulate_weights,
                                               make_int3(input_vol_dim[0], input_vol_dim[1], input_vol_dim[2]),
-                                              make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]));
+                                              make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]),
+                                              tex_xdvf,
+                                              tex_ydvf,
+                                              tex_zdvf);
   else
     nearestNeighborSplat_3Dgrid<<<dimGrid, dimBlock>>>(
       dev_input_vol,
       dev_output_vol,
       dev_accumulate_weights,
       make_int3(input_vol_dim[0], input_vol_dim[1], input_vol_dim[2]),
-      make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]));
+      make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]),
+      tex_xdvf,
+      tex_ydvf,
+      tex_zdvf);
 
   CUDA_CHECK_ERROR;
 
@@ -552,25 +443,19 @@ CUDA_ForwardWarp(int     input_vol_dim[3],
     dev_output_vol, dev_accumulate_weights, make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]));
   CUDA_CHECK_ERROR;
 
-  // Unbind the image and projection matrix textures
-  cudaUnbindTexture(tex_xdvf);
-  cudaUnbindTexture(tex_ydvf);
-  cudaUnbindTexture(tex_zdvf);
+  // Cleanup
+  cudaFreeArray(array_xdvf);
   CUDA_CHECK_ERROR;
-  cudaUnbindTexture(tex_IndexInputToPPInputMatrix);
-  cudaUnbindTexture(tex_PPOutputToIndexOutputMatrix);
-  cudaUnbindTexture(tex_IndexInputToIndexDVFMatrix);
+  cudaFreeArray(array_ydvf);
   CUDA_CHECK_ERROR;
-
-  // Cleanup
-  cudaFreeArray((cudaArray *)array_xdvf);
-  cudaFreeArray((cudaArray *)array_ydvf);
-  cudaFreeArray((cudaArray *)array_zdvf);
+  cudaFreeArray(array_zdvf);
   CUDA_CHECK_ERROR;
   cudaFree(dev_accumulate_weights);
   CUDA_CHECK_ERROR;
-  cudaFree(dev_IndexInputToPPInput);
-  cudaFree(dev_PPOutputToIndexOutput);
-  cudaFree(dev_IndexInputToIndexDVF);
+  cudaDestroyTextureObject(tex_xdvf);
+  CUDA_CHECK_ERROR;
+  cudaDestroyTextureObject(tex_ydvf);
+  CUDA_CHECK_ERROR;
+  cudaDestroyTextureObject(tex_zdvf);
   CUDA_CHECK_ERROR;
 }
diff --git a/src/rtkCudaParkerShortScanImageFilter.cu b/src/rtkCudaParkerShortScanImageFilter.cu
index 8e3c6f734..be4656371 100644
--- a/src/rtkCudaParkerShortScanImageFilter.cu
+++ b/src/rtkCudaParkerShortScanImageFilter.cu
@@ -22,8 +22,6 @@
 #include <cuda_runtime.h>
 #include <math_constants.h>
 
-texture<float, 1, cudaReadModeElementType> tex_geometry; // geometry texture
-
 inline __device__ float
 TransformIndexToPhysicalPoint(int2 idx, float origin, float row, float column)
 {
@@ -45,17 +43,18 @@ ToUntiltedCoordinateAtIsocenter(float tiltedCoord, float sdd, float sid, float s
 }
 
 __global__ void
-kernel_parker_weight(int2    proj_idx,
-                     int3    proj_size,
-                     int2    proj_size_buf_in,
-                     int2    proj_size_buf_out,
-                     float * dev_proj_in,
-                     float * dev_proj_out,
-                     float   delta,
-                     float   firstAngle,
-                     float   proj_orig, // projection origin
-                     float   proj_row,  // projection row direction & spacing
-                     float   proj_col   // projection col direction & spacing
+kernel_parker_weight(int2                proj_idx,
+                     int3                proj_size,
+                     int2                proj_size_buf_in,
+                     int2                proj_size_buf_out,
+                     float *             dev_proj_in,
+                     float *             dev_proj_out,
+                     float               delta,
+                     float               firstAngle,
+                     float               proj_orig, // projection origin
+                     float               proj_row,  // projection row direction & spacing
+                     float               proj_col,  // projection col direction & spacing
+                     cudaTextureObject_t tex_geom   // geometry texture object
 )
 {
   // compute projection index (== thread index)
@@ -70,10 +69,10 @@ kernel_parker_weight(int2    proj_idx,
   if (pIdx.x >= proj_size.x || pIdx.y >= proj_size.y || pIdx.z >= proj_size.z)
     return;
 
-  float sdd = tex1Dfetch(tex_geometry, pIdx.z * 5 + 0);
-  float sx = tex1Dfetch(tex_geometry, pIdx.z * 5 + 1);
-  float px = tex1Dfetch(tex_geometry, pIdx.z * 5 + 2);
-  float sid = tex1Dfetch(tex_geometry, pIdx.z * 5 + 3);
+  float sdd = tex1Dfetch<float>(tex_geom, pIdx.z * 5 + 0);
+  float sx = tex1Dfetch<float>(tex_geom, pIdx.z * 5 + 1);
+  float px = tex1Dfetch<float>(tex_geom, pIdx.z * 5 + 2);
+  float sid = tex1Dfetch<float>(tex_geom, pIdx.z * 5 + 3);
 
   // convert actual index to point
   float pPoint =
@@ -86,7 +85,7 @@ kernel_parker_weight(int2    proj_idx,
   float alpha = atan(-1 * l * invsid);
 
   // beta projection angle: Parker's article assumes that the scan starts at 0
-  float beta = tex1Dfetch(tex_geometry, pIdx.z * 5 + 4);
+  float beta = tex1Dfetch<float>(tex_geom, pIdx.z * 5 + 4);
   beta -= firstAngle;
   if (beta < 0)
     beta += (2.f * CUDART_PI_F);
@@ -111,21 +110,16 @@ CUDA_parker_weight(int     proj_idx[2],
                    int     proj_dim_buf_out[2],
                    float * dev_proj_in,
                    float * dev_proj_out,
-                   float * geometries,
+                   float * geometry,
                    float   delta,
                    float   firstAngle,
                    float   proj_orig,
                    float   proj_row,
                    float   proj_col)
 {
-  // copy geometry matrix to device, bind the matrix to the texture
-  float * dev_geom;
-  cudaMalloc((void **)&dev_geom, proj_dim[2] * 5 * sizeof(float));
-  CUDA_CHECK_ERROR;
-  cudaMemcpy(dev_geom, geometries, proj_dim[2] * 5 * sizeof(float), cudaMemcpyHostToDevice);
-  CUDA_CHECK_ERROR;
-  cudaBindTexture(0, tex_geometry, dev_geom, proj_dim[2] * 5 * sizeof(float));
-  CUDA_CHECK_ERROR;
+  float *             dev_geom;
+  cudaTextureObject_t tex_geom;
+  prepareGeometryTextureObject(proj_dim[2], geometry, dev_geom, tex_geom, 5);
 
   // Thread Block Dimensions
   int tBlock_x = 16;
@@ -139,6 +133,7 @@ CUDA_parker_weight(int     proj_idx[2],
 
   dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
   dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
+
   kernel_parker_weight<<<dimGrid, dimBlock>>>(make_int2(proj_idx[0], proj_idx[1]),
                                               make_int3(proj_dim[0], proj_dim[1], proj_dim[2]),
                                               make_int2(proj_dim_buf_in[0], proj_dim_buf_in[1]),
@@ -149,10 +144,11 @@ CUDA_parker_weight(int     proj_idx[2],
                                               firstAngle,
                                               proj_orig,
                                               proj_row,
-                                              proj_col);
+                                              proj_col,
+                                              tex_geom);
 
-  // Unbind matrix texture
-  cudaUnbindTexture(tex_geometry);
+  // destroy texture object
+  cudaDestroyTextureObject(tex_geom);
   CUDA_CHECK_ERROR;
   cudaFree(dev_geom);
   CUDA_CHECK_ERROR;
diff --git a/src/rtkCudaUtilities.cu b/src/rtkCudaUtilities.cu
index 4f3938d05..020fac6cc 100644
--- a/src/rtkCudaUtilities.cu
+++ b/src/rtkCudaUtilities.cu
@@ -17,6 +17,7 @@
  *=========================================================================*/
 
 #include "rtkCudaUtilities.hcu"
+#include <cublas_v2.h>
 
 std::vector<int>
 GetListOfCudaDevices()
@@ -76,13 +77,73 @@ GetFreeGPUGlobalMemory(int device)
   return free;
 }
 
+__host__ void
+prepareScalarTextureObject(int                          size[3],
+                           float *                      dev_ptr,
+                           cudaArray *&                 threeDArray,
+                           cudaTextureObject_t &        tex,
+                           const bool                   isProjections,
+                           const bool                   isLinear,
+                           const cudaTextureAddressMode texAddressMode)
+{
+  // create texture object
+  cudaResourceDesc resDesc = {};
+  resDesc.resType = cudaResourceTypeArray;
+
+  cudaTextureDesc texDesc = {};
+  texDesc.readMode = cudaReadModeElementType;
+
+  for (int component = 0; component < 3; component++)
+    texDesc.addressMode[component] = texAddressMode;
+  if (isLinear)
+    texDesc.filterMode = cudaFilterModeLinear;
+  else
+    texDesc.filterMode = cudaFilterModePoint;
+
+  static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+  cudaExtent                   volExtent = make_cudaExtent(size[0], size[1], size[2]);
+
+  // Allocate an intermediate memory space to extract the components of the input volume
+  float * singleComponent;
+  int     numel = size[0] * size[1] * size[2];
+  cudaMalloc(&singleComponent, numel * sizeof(float));
+  CUDA_CHECK_ERROR;
+
+  // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr.
+  // The best way to understand it is to read
+  // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api
+
+  // Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays
+  if (isProjections)
+    cudaMalloc3DArray(&threeDArray, &channelDesc, volExtent, cudaArrayLayered);
+  else
+    cudaMalloc3DArray(&threeDArray, &channelDesc, volExtent);
+  CUDA_CHECK_ERROR;
+
+  // Fill it with the current singleComponent
+  cudaMemcpy3DParms CopyParams = {};
+  CopyParams.srcPtr = make_cudaPitchedPtr(dev_ptr, size[0] * sizeof(float), size[0], size[1]);
+  CUDA_CHECK_ERROR;
+  CopyParams.dstArray = threeDArray;
+  CopyParams.extent = volExtent;
+  CopyParams.kind = cudaMemcpyDeviceToDevice;
+  cudaMemcpy3D(&CopyParams);
+  CUDA_CHECK_ERROR;
+
+  // Fill in the texture object with all this information
+  resDesc.res.array.array = threeDArray;
+  cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+  CUDA_CHECK_ERROR;
+}
+
 __host__ void
 prepareVectorTextureObject(int                                size[3],
                            const float *                      dev_ptr,
                            std::vector<cudaArray *> &         componentArrays,
                            const unsigned int                 nComponents,
                            std::vector<cudaTextureObject_t> & tex,
-                           bool                               isProjections)
+                           const bool                         isProjections,
+                           const cudaTextureAddressMode       texAddressMode)
 {
   componentArrays.resize(nComponents);
   tex.resize(nComponents);
@@ -92,20 +153,13 @@ prepareVectorTextureObject(int                                size[3],
   cublasCreate(&handle);
 
   // create texture object
-  cudaResourceDesc resDesc;
-  memset(&resDesc, 0, sizeof(resDesc));
+  cudaResourceDesc resDesc = {};
   resDesc.resType = cudaResourceTypeArray;
 
-  cudaTextureDesc texDesc;
-  memset(&texDesc, 0, sizeof(texDesc));
+  cudaTextureDesc texDesc = {};
   texDesc.readMode = cudaReadModeElementType;
   for (int component = 0; component < 3; component++)
-  {
-    if (isProjections)
-      texDesc.addressMode[component] = cudaAddressModeBorder;
-    else
-      texDesc.addressMode[component] = cudaAddressModeClamp;
-  }
+    texDesc.addressMode[component] = texAddressMode;
   texDesc.filterMode = cudaFilterModeLinear;
 
   static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
@@ -127,7 +181,7 @@ prepareVectorTextureObject(int                                size[3],
     cudaMemset((void *)singleComponent, 0, numel * sizeof(float));
 
     // Fill it with the current component
-    float * pComponent = dev_ptr + component;
+    const float * pComponent = dev_ptr + component;
     cublasSaxpy(handle, numel, &one, pComponent, nComponents, singleComponent, 1);
 
     // Allocate the cudaArray. Projections use layered arrays, volumes use default 3D arrays
@@ -140,6 +194,7 @@ prepareVectorTextureObject(int                                size[3],
     // Fill it with the current singleComponent
     cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms();
     CopyParams.srcPtr = make_cudaPitchedPtr(singleComponent, size[0] * sizeof(float), size[0], size[1]);
+    CUDA_CHECK_ERROR;
     CopyParams.dstArray = componentArrays[component];
     CopyParams.extent = volExtent;
     CopyParams.kind = cudaMemcpyDeviceToDevice;
@@ -158,3 +213,31 @@ prepareVectorTextureObject(int                                size[3],
   // Destroy CUBLAS context
   cublasDestroy(handle);
 }
+
+__host__ void
+prepareGeometryTextureObject(int                   length,
+                             const float *         geometry,
+                             float *&              dev_geom,
+                             cudaTextureObject_t & tex_geom,
+                             const unsigned int    nParam)
+{
+  // copy geometry matrix to device, bind the matrix to the texture
+  cudaMalloc((void **)&dev_geom, length * nParam * sizeof(float));
+  CUDA_CHECK_ERROR;
+  cudaMemcpy(dev_geom, geometry, length * nParam * sizeof(float), cudaMemcpyHostToDevice);
+  CUDA_CHECK_ERROR;
+
+  // create texture object
+  cudaResourceDesc resDesc = {};
+  resDesc.resType = cudaResourceTypeLinear;
+  resDesc.res.linear.devPtr = dev_geom;
+  resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
+  resDesc.res.linear.desc.x = 32; // bits per channel
+  resDesc.res.linear.sizeInBytes = length * nParam * sizeof(float);
+
+  cudaTextureDesc texDesc = {};
+  texDesc.readMode = cudaReadModeElementType;
+
+  cudaCreateTextureObject(&tex_geom, &resDesc, &texDesc, NULL);
+  CUDA_CHECK_ERROR;
+}
diff --git a/src/rtkCudaWarpBackProjectionImageFilter.cu b/src/rtkCudaWarpBackProjectionImageFilter.cu
index 333b6dfba..a26d57f5f 100644
--- a/src/rtkCudaWarpBackProjectionImageFilter.cu
+++ b/src/rtkCudaWarpBackProjectionImageFilter.cu
@@ -38,17 +38,8 @@
  * CUDA #includes *
  *****************/
 #include <cuda.h>
-#include <cublas_v2.h>
 #include <cuda_runtime.h>
 
-// T E X T U R E S ////////////////////////////////////////////////////////
-texture<float, cudaTextureType2DLayered> tex_proj;
-
-texture<float, 3, cudaReadModeElementType> tex_xdvf;
-texture<float, 3, cudaReadModeElementType> tex_ydvf;
-texture<float, 3, cudaReadModeElementType> tex_zdvf;
-///////////////////////////////////////////////////////////////////////////
-
 // CONSTANTS //////////////////////////////////////////////////////////////
 __constant__ float c_matrices[SLAB_SIZE * 12]; // Can process stacks of at most SLAB_SIZE projections
 __constant__ float c_volIndexToProjPP[SLAB_SIZE * 12];
@@ -66,7 +57,12 @@ __constant__ float c_IndexInputToPPInputMatrix[12];
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 
 __global__ void
-kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out)
+kernel_warp_back_project_3Dgrid(float *             dev_vol_in,
+                                float *             dev_vol_out,
+                                cudaTextureObject_t tex_xdvf,
+                                cudaTextureObject_t tex_ydvf,
+                                cudaTextureObject_t tex_zdvf,
+                                cudaTextureObject_t tex_proj)
 {
   unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
   unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
@@ -89,9 +85,9 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out)
     IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix);
 
     // Get each component of the displacement vector by interpolation in the DVF
-    Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-    Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-    Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+    Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+    Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+    Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
 
     // Compute the physical point in input + the displacement vector
     PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement;
@@ -108,7 +104,7 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out)
     ip.y = ip.y * ip.z;
 
     // Get texture point, clip left to GPU
-    voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj);
+    voxel_data += tex2DLayered<float>(tex_proj, ip.x, ip.y, proj);
   }
 
   // Place it into the volume
@@ -116,7 +112,13 @@ kernel_warp_back_project_3Dgrid(float * dev_vol_in, float * dev_vol_out)
 }
 
 __global__ void
-kernel_warp_back_project_3Dgrid_cylindrical_detector(float * dev_vol_in, float * dev_vol_out, float radius)
+kernel_warp_back_project_3Dgrid_cylindrical_detector(float *             dev_vol_in,
+                                                     float *             dev_vol_out,
+                                                     float               radius,
+                                                     cudaTextureObject_t tex_xdvf,
+                                                     cudaTextureObject_t tex_ydvf,
+                                                     cudaTextureObject_t tex_zdvf,
+                                                     cudaTextureObject_t tex_proj)
 {
   unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
   unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
@@ -139,9 +141,9 @@ kernel_warp_back_project_3Dgrid_cylindrical_detector(float * dev_vol_in, float *
     IndexInDVF = matrix_multiply(make_float3(i, j, k), c_IndexInputToIndexDVFMatrix);
 
     // Get each component of the displacement vector by interpolation in the DVF
-    Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-    Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-    Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+    Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+    Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+    Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
 
     // Compute the physical point in input + the displacement vector
     PP = matrix_multiply(make_float3(i, j, k), c_IndexInputToPPInputMatrix) + Displacement;
@@ -167,7 +169,7 @@ kernel_warp_back_project_3Dgrid_cylindrical_detector(float * dev_vol_in, float *
     ip.y = c_projPPToProjIndex[3] * pp.x + c_projPPToProjIndex[4] * pp.y + c_projPPToProjIndex[5];
 
     // Get texture point, clip left to GPU
-    voxel_data += tex2DLayered(tex_proj, ip.x, ip.y, proj);
+    voxel_data += tex2DLayered<float>(tex_proj, ip.x, ip.y, proj);
   }
 
   // Place it into the volume
@@ -196,13 +198,6 @@ CUDA_warp_back_project(int     projSize[3],
                        float   IndexInputToPPInputMatrix[12],
                        double  radiusCylindricalDetector)
 {
-  // Create CUBLAS context
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-
-  int device;
-  cudaGetDevice(&device);
-
   // Copy the size of inputs into constant memory
   cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3));
   cudaMemcpyToSymbol(c_volSize, volSize, sizeof(int3));
@@ -212,96 +207,15 @@ CUDA_warp_back_project(int     projSize[3],
   cudaMemcpyToSymbol(c_volIndexToProjPP, &(volIndexToProjPPs[0]), 12 * sizeof(float) * projSize[2]);
   cudaMemcpyToSymbol(c_projPPToProjIndex, &(projPPToProjIndex[0]), 9 * sizeof(float));
 
-  // set texture parameters
-  tex_proj.addressMode[0] = cudaAddressModeBorder;
-  tex_proj.addressMode[1] = cudaAddressModeBorder;
-  tex_proj.addressMode[2] = cudaAddressModeBorder;
-  tex_proj.filterMode = cudaFilterModeLinear;
-  tex_proj.normalized = false; // don't access with normalized texture coords
-
-  // Copy projection data to array, bind the array to the texture
-  cudaExtent                   projExtent = make_cudaExtent(projSize[0], projSize[1], projSize[2]);
-  cudaArray *                  array_proj;
-  static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-  CUDA_CHECK_ERROR;
-
-  // Allocate array for input projections, in order to bind them to
-  // a 2D layered texture
-  cudaMalloc3DArray((cudaArray **)&array_proj, &channelDesc, projExtent, cudaArrayLayered);
-  CUDA_CHECK_ERROR;
-
-  // Copy data to 3D array
-  cudaMemcpy3DParms copyParams = cudaMemcpy3DParms();
-  copyParams.srcPtr = make_cudaPitchedPtr(dev_proj, projSize[0] * sizeof(float), projSize[0], projSize[1]);
-  copyParams.dstArray = (cudaArray *)array_proj;
-  copyParams.extent = projExtent;
-  copyParams.kind = cudaMemcpyDeviceToDevice;
-  cudaMemcpy3D(&copyParams);
-  CUDA_CHECK_ERROR;
+  // Prepare proj texture
+  cudaArray *         array_proj;
+  cudaTextureObject_t tex_proj;
+  prepareScalarTextureObject(projSize, dev_proj, array_proj, tex_proj, true);
 
-  // Extent stuff, will be used for each component extraction
-  cudaExtent dvfExtent = make_cudaExtent(dvf_size[0], dvf_size[1], dvf_size[2]);
-
-  // Set texture parameters
-  tex_xdvf.addressMode[0] = cudaAddressModeBorder;
-  tex_xdvf.addressMode[1] = cudaAddressModeBorder;
-  tex_xdvf.addressMode[2] = cudaAddressModeBorder;
-  tex_xdvf.filterMode = cudaFilterModeLinear;
-  tex_xdvf.normalized = false; // don't access with normalized texture coords
-
-  tex_ydvf.addressMode[0] = cudaAddressModeBorder;
-  tex_ydvf.addressMode[1] = cudaAddressModeBorder;
-  tex_ydvf.addressMode[2] = cudaAddressModeBorder;
-  tex_ydvf.filterMode = cudaFilterModeLinear;
-  tex_ydvf.normalized = false;
-
-  tex_zdvf.addressMode[0] = cudaAddressModeBorder;
-  tex_zdvf.addressMode[1] = cudaAddressModeBorder;
-  tex_zdvf.addressMode[2] = cudaAddressModeBorder;
-  tex_zdvf.filterMode = cudaFilterModeLinear;
-  tex_zdvf.normalized = false;
-
-  // Allocate an intermediate memory space to extract x, y and z components of the DVF
-  float * DVFcomponent;
-  int     numel = dvf_size[0] * dvf_size[1] * dvf_size[2];
-  cudaMalloc(&DVFcomponent, numel * sizeof(float));
-  float one = 1.0;
-
-  // Allocate the arrays used for textures
-  cudaArray ** DVFcomponentArrays = new cudaArray *[3];
-  CUDA_CHECK_ERROR;
-
-  // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr.
-  // The best way to understand it is to read
-  // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api
-  for (unsigned int component = 0; component < 3; component++)
-  {
-    // Reset the intermediate memory
-    cudaMemset((void *)DVFcomponent, 0, numel * sizeof(float));
-
-    // Fill it with the current component
-    float * pComponent = dev_input_dvf + component;
-    cublasSaxpy(handle, numel, &one, pComponent, 3, DVFcomponent, 1);
-
-    // Allocate the cudaArray and fill it with the current DVFcomponent
-    cudaMalloc3DArray((cudaArray **)&DVFcomponentArrays[component], &channelDesc, dvfExtent);
-    cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms();
-    CopyParams.srcPtr = make_cudaPitchedPtr(DVFcomponent, dvf_size[0] * sizeof(float), dvf_size[0], dvf_size[1]);
-    CopyParams.dstArray = (cudaArray *)DVFcomponentArrays[component];
-    CopyParams.extent = dvfExtent;
-    CopyParams.kind = cudaMemcpyDeviceToDevice;
-    cudaMemcpy3D(&CopyParams);
-    CUDA_CHECK_ERROR;
-  }
-
-  // Intermediate memory is no longer needed
-  cudaFree(DVFcomponent);
-
-  // Bind 3D arrays to 3D textures
-  cudaBindTextureToArray(tex_xdvf, (cudaArray *)DVFcomponentArrays[0], channelDesc);
-  cudaBindTextureToArray(tex_ydvf, (cudaArray *)DVFcomponentArrays[1], channelDesc);
-  cudaBindTextureToArray(tex_zdvf, (cudaArray *)DVFcomponentArrays[2], channelDesc);
-  CUDA_CHECK_ERROR;
+  // Prepare DVF textures
+  std::vector<cudaArray *>         DVFComponentArrays;
+  std::vector<cudaTextureObject_t> tex_dvf;
+  prepareVectorTextureObject(dvf_size, dev_input_dvf, DVFComponentArrays, 3, tex_dvf, false);
 
   // Copy matrices into constant memory
   cudaMemcpyToSymbol(
@@ -329,35 +243,27 @@ CUDA_warp_back_project(int     projSize[3],
   dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
   CUDA_CHECK_ERROR;
 
-  // Bind the array of projections to a 2D layered texture
-  cudaBindTextureToArray(tex_proj, (cudaArray *)array_proj, channelDesc);
-  CUDA_CHECK_ERROR;
-
   // Note: cbi->img is passed via texture memory
   // Matrices are passed via constant memory
   //-------------------------------------
   if (radiusCylindricalDetector == 0)
-    kernel_warp_back_project_3Dgrid<<<dimGrid, dimBlock>>>(dev_vol_in, dev_vol_out);
+    kernel_warp_back_project_3Dgrid<<<dimGrid, dimBlock>>>(
+      dev_vol_in, dev_vol_out, tex_dvf[0], tex_dvf[1], tex_dvf[2], tex_proj);
   else
     kernel_warp_back_project_3Dgrid_cylindrical_detector<<<dimGrid, dimBlock>>>(
-      dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector);
-  // Unbind the image and projection matrix textures
-  cudaUnbindTexture(tex_proj);
+      dev_vol_in, dev_vol_out, (float)radiusCylindricalDetector, tex_dvf[0], tex_dvf[1], tex_dvf[2], tex_proj);
   CUDA_CHECK_ERROR;
 
-  // Unbind the image and projection matrix textures
-  cudaUnbindTexture(tex_xdvf);
-  cudaUnbindTexture(tex_ydvf);
-  cudaUnbindTexture(tex_zdvf);
-
   // Cleanup
-  cudaFreeArray((cudaArray *)DVFcomponentArrays[0]);
-  cudaFreeArray((cudaArray *)DVFcomponentArrays[1]);
-  cudaFreeArray((cudaArray *)DVFcomponentArrays[2]);
-  delete[] DVFcomponentArrays;
+  for (unsigned int c = 0; c < 3; c++)
+  {
+    cudaFreeArray(DVFComponentArrays[c]);
+    CUDA_CHECK_ERROR;
+    cudaDestroyTextureObject(tex_dvf[c]);
+    CUDA_CHECK_ERROR;
+  }
   cudaFreeArray((cudaArray *)array_proj);
   CUDA_CHECK_ERROR;
-
-  // Destroy CUBLAS context
-  cublasDestroy(handle);
+  cudaDestroyTextureObject(tex_proj);
+  CUDA_CHECK_ERROR;
 }
diff --git a/src/rtkCudaWarpForwardProjectionImageFilter.cu b/src/rtkCudaWarpForwardProjectionImageFilter.cu
index 0b7578726..094cc1f42 100644
--- a/src/rtkCudaWarpForwardProjectionImageFilter.cu
+++ b/src/rtkCudaWarpForwardProjectionImageFilter.cu
@@ -36,15 +36,9 @@
  * CUDA #includes *
  *****************/
 #include <cuda.h>
-#include <cublas_v2.h>
 #include <cuda_runtime.h>
 
-// TEXTURES AND CONSTANTS //
-texture<float, 3, cudaReadModeElementType> tex_xdvf;
-texture<float, 3, cudaReadModeElementType> tex_ydvf;
-texture<float, 3, cudaReadModeElementType> tex_zdvf;
-texture<float, 3, cudaReadModeElementType> tex_vol;
-
+// CONSTANTS
 __constant__ int3 c_projSize;
 __constant__ float3 c_boxMin;
 __constant__ float3 c_boxMax;
@@ -65,7 +59,12 @@ __constant__ float c_PPInputToIndexInputMatrix[12];
 
 // KERNEL kernel_forwardProject
 __global__ void
-kernel_warped_forwardProject(float * dev_proj_in, float * dev_proj_out)
+kernel_warped_forwardProject(float *             dev_proj_in,
+                             float *             dev_proj_out,
+                             cudaTextureObject_t tex_xdvf,
+                             cudaTextureObject_t tex_ydvf,
+                             cudaTextureObject_t tex_zdvf,
+                             cudaTextureObject_t tex_vol)
 {
   unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
   unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
@@ -122,9 +121,9 @@ kernel_warped_forwardProject(float * dev_proj_in, float * dev_proj_out)
 
         // Get each component of the displacement vector by
         // interpolation in the dvf
-        Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-        Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-        Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+        Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+        Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+        Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
 
         // Matrix multiply to get the physical coordinates of the current point in the output volume
         // + the displacement
@@ -134,7 +133,7 @@ kernel_warped_forwardProject(float * dev_proj_in, float * dev_proj_out)
         IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix);
 
         // Read from 3D texture from volume
-        sample = tex3D(tex_vol, IndexInInput.x, IndexInInput.y, IndexInInput.z);
+        sample = tex3D<float>(tex_vol, IndexInInput.x, IndexInInput.y, IndexInInput.z);
 
         // Accumulate, and move forward along the ray
         sum += sample;
@@ -172,10 +171,6 @@ CUDA_warp_forward_project(int     projSize[3],
                           float   PPInputToIndexInputMatrix[12],
                           float   IndexInputToPPInputMatrix[12])
 {
-  // Create CUBLAS context
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-
   // constant memory
   cudaMemcpyToSymbol(c_projSize, projSize, sizeof(int3));
   cudaMemcpyToSymbol(c_boxMin, box_min, sizeof(float3));
@@ -190,94 +185,15 @@ CUDA_warp_forward_project(int     projSize[3],
   // Copy the projection matrices into constant memory
   cudaMemcpyToSymbol(c_matrices, &(matrices[0]), 12 * sizeof(float) * projSize[2]);
 
-  // Prepare channel description for arrays
-  static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-
-  // Extent stuff, will be used for each component extraction
-  cudaExtent dvfExtent = make_cudaExtent(dvfSize[0], dvfSize[1], dvfSize[2]);
-
-  // Set texture parameters for the input volume
-  tex_vol.addressMode[0] = cudaAddressModeClamp; // clamp texture coordinates
-  tex_vol.addressMode[1] = cudaAddressModeClamp;
-  tex_vol.addressMode[2] = cudaAddressModeClamp;
-  tex_vol.normalized = false;                // access with normalized texture coordinates
-  tex_vol.filterMode = cudaFilterModeLinear; // linear interpolation
-
-  // Copy volume data to array, bind the array to the texture
-  cudaExtent  volExtent = make_cudaExtent(volSize[0], volSize[1], volSize[2]);
-  cudaArray * array_vol;
-  cudaMalloc3DArray((cudaArray **)&array_vol, &channelDesc, volExtent);
-  CUDA_CHECK_ERROR;
-
-  // Copy data to 3D array
-  cudaMemcpy3DParms copyParams = cudaMemcpy3DParms();
-  copyParams.srcPtr = make_cudaPitchedPtr(dev_vol, volSize[0] * sizeof(float), volSize[0], volSize[1]);
-  copyParams.dstArray = (cudaArray *)array_vol;
-  copyParams.extent = volExtent;
-  copyParams.kind = cudaMemcpyDeviceToDevice;
-  cudaMemcpy3D(&copyParams);
-  CUDA_CHECK_ERROR;
-
-  // Set texture parameters
-  tex_xdvf.addressMode[0] = cudaAddressModeBorder;
-  tex_xdvf.addressMode[1] = cudaAddressModeBorder;
-  tex_xdvf.addressMode[2] = cudaAddressModeBorder;
-  tex_xdvf.filterMode = cudaFilterModeLinear;
-  tex_xdvf.normalized = false; // don't access with normalized texture coords
-
-  tex_ydvf.addressMode[0] = cudaAddressModeBorder;
-  tex_ydvf.addressMode[1] = cudaAddressModeBorder;
-  tex_ydvf.addressMode[2] = cudaAddressModeBorder;
-  tex_ydvf.filterMode = cudaFilterModeLinear;
-  tex_ydvf.normalized = false;
-
-  tex_zdvf.addressMode[0] = cudaAddressModeBorder;
-  tex_zdvf.addressMode[1] = cudaAddressModeBorder;
-  tex_zdvf.addressMode[2] = cudaAddressModeBorder;
-  tex_zdvf.filterMode = cudaFilterModeLinear;
-  tex_zdvf.normalized = false;
-
-  // Allocate an intermediate memory space to extract x, y and z components of the DVF
-  float * DVFcomponent;
-  int     numel = dvfSize[0] * dvfSize[1] * dvfSize[2];
-  cudaMalloc(&DVFcomponent, numel * sizeof(float));
-  float one = 1.0;
-
-  // Allocate the arrays used for textures
-  cudaArray ** DVFcomponentArrays = new cudaArray *[3];
-  CUDA_CHECK_ERROR;
-
-  // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr.
-  // The best way to understand it is to read
-  // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api
-  for (unsigned int component = 0; component < 3; component++)
-  {
-    // Reset the intermediate memory
-    cudaMemset((void *)DVFcomponent, 0, numel * sizeof(float));
-
-    // Fill it with the current component
-    float * pComponent = dev_input_dvf + component;
-    cublasSaxpy(handle, numel, &one, pComponent, 3, DVFcomponent, 1);
-
-    // Allocate the cudaArray and fill it with the current DVFcomponent
-    cudaMalloc3DArray((cudaArray **)&DVFcomponentArrays[component], &channelDesc, dvfExtent);
-    cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms();
-    CopyParams.srcPtr = make_cudaPitchedPtr(DVFcomponent, dvfSize[0] * sizeof(float), dvfSize[0], dvfSize[1]);
-    CopyParams.dstArray = (cudaArray *)DVFcomponentArrays[component];
-    CopyParams.extent = dvfExtent;
-    CopyParams.kind = cudaMemcpyDeviceToDevice;
-    cudaMemcpy3D(&CopyParams);
-    CUDA_CHECK_ERROR;
-  }
-
-  // Intermediate memory is no longer needed
-  cudaFree(DVFcomponent);
+  // Prepare volume texture
+  cudaArray *         array_vol;
+  cudaTextureObject_t tex_vol;
+  prepareScalarTextureObject(volSize, dev_vol, array_vol, tex_vol, false, true, cudaAddressModeClamp);
 
-  // Bind 3D arrays to 3D textures
-  cudaBindTextureToArray(tex_xdvf, (cudaArray *)DVFcomponentArrays[0], channelDesc);
-  cudaBindTextureToArray(tex_ydvf, (cudaArray *)DVFcomponentArrays[1], channelDesc);
-  cudaBindTextureToArray(tex_zdvf, (cudaArray *)DVFcomponentArrays[2], channelDesc);
-  CUDA_CHECK_ERROR;
+  // Prepare DVF textures
+  std::vector<cudaArray *>         DVFComponentArrays;
+  std::vector<cudaTextureObject_t> tex_dvf;
+  prepareVectorTextureObject(dvfSize, dev_input_dvf, DVFComponentArrays, 3, tex_dvf, false);
 
   // Copy matrices into constant memory
   cudaMemcpyToSymbol(
@@ -292,25 +208,19 @@ CUDA_warp_forward_project(int     projSize[3],
   dim3 dimBlock = dim3(16, 16, 1);
   dim3 dimGrid = dim3(iDivUp(projSize[0], dimBlock.x), iDivUp(projSize[1], dimBlock.y));
 
-  // Bind 3D array to 3D texture
-  cudaBindTextureToArray(tex_vol, (cudaArray *)array_vol, channelDesc);
-  CUDA_CHECK_ERROR;
-
-  kernel_warped_forwardProject<<<dimGrid, dimBlock>>>(dev_proj_in, dev_proj_out);
-
-  cudaUnbindTexture(tex_xdvf);
-  cudaUnbindTexture(tex_ydvf);
-  cudaUnbindTexture(tex_zdvf);
-  cudaUnbindTexture(tex_vol);
-  CUDA_CHECK_ERROR;
+  kernel_warped_forwardProject<<<dimGrid, dimBlock>>>(
+    dev_proj_in, dev_proj_out, tex_dvf[0], tex_dvf[1], tex_dvf[2], tex_vol);
 
-  cudaFreeArray((cudaArray *)DVFcomponentArrays[0]);
-  cudaFreeArray((cudaArray *)DVFcomponentArrays[1]);
-  cudaFreeArray((cudaArray *)DVFcomponentArrays[2]);
-  delete[] DVFcomponentArrays;
+  // Cleanup
+  for (unsigned int c = 0; c < 3; c++)
+  {
+    cudaFreeArray(DVFComponentArrays[c]);
+    CUDA_CHECK_ERROR;
+    cudaDestroyTextureObject(tex_dvf[c]);
+    CUDA_CHECK_ERROR;
+  }
   cudaFreeArray((cudaArray *)array_vol);
   CUDA_CHECK_ERROR;
-
-  // Destroy CUBLAS context
-  cublasDestroy(handle);
+  cudaDestroyTextureObject(tex_vol);
+  CUDA_CHECK_ERROR;
 }
diff --git a/src/rtkCudaWarpImageFilter.cu b/src/rtkCudaWarpImageFilter.cu
index 132cbe9cb..0038ec72d 100644
--- a/src/rtkCudaWarpImageFilter.cu
+++ b/src/rtkCudaWarpImageFilter.cu
@@ -38,19 +38,6 @@
  * CUDA #includes *
  *****************/
 #include <cuda.h>
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
-
-// T E X T U R E S ////////////////////////////////////////////////////////
-texture<float, 1, cudaReadModeElementType> tex_IndexOutputToPPOutputMatrix;
-texture<float, 1, cudaReadModeElementType> tex_IndexOutputToIndexDVFMatrix;
-texture<float, 1, cudaReadModeElementType> tex_PPInputToIndexInputMatrix;
-
-texture<float, 3, cudaReadModeElementType> tex_xdvf;
-texture<float, 3, cudaReadModeElementType> tex_ydvf;
-texture<float, 3, cudaReadModeElementType> tex_zdvf;
-texture<float, 3, cudaReadModeElementType> tex_input_vol;
-///////////////////////////////////////////////////////////////////////////
 
 // CONSTANTS //////////////////////////////////////////////////////////////
 __constant__ float c_IndexOutputToPPOutputMatrix[12];
@@ -64,7 +51,12 @@ __constant__ float c_PPInputToIndexInputMatrix[12];
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
 
 __global__ void
-kernel_3Dgrid(float * dev_vol_out, int3 vol_dim)
+kernel_3Dgrid(float *             dev_vol_out,
+              int3                vol_dim,
+              cudaTextureObject_t tex_xdvf,
+              cudaTextureObject_t tex_ydvf,
+              cudaTextureObject_t tex_zdvf,
+              cudaTextureObject_t tex_input_vol)
 {
   unsigned int i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
   unsigned int j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
@@ -84,9 +76,9 @@ kernel_3Dgrid(float * dev_vol_out, int3 vol_dim)
   // Get each component of the displacement vector by
   // interpolation in the dvf
   float3 Displacement;
-  Displacement.x = tex3D(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-  Displacement.y = tex3D(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
-  Displacement.z = tex3D(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+  Displacement.x = tex3D<float>(tex_xdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+  Displacement.y = tex3D<float>(tex_ydvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
+  Displacement.z = tex3D<float>(tex_zdvf, IndexInDVF.x + 0.5f, IndexInDVF.y + 0.5f, IndexInDVF.z + 0.5f);
 
   // Matrix multiply to get the physical coordinates of the current point in the output volume
   float3 PP = matrix_multiply(make_float3(i, j, k), c_IndexOutputToPPOutputMatrix);
@@ -98,7 +90,8 @@ kernel_3Dgrid(float * dev_vol_out, int3 vol_dim)
   float3 IndexInInput = matrix_multiply(PP, c_PPInputToIndexInputMatrix);
 
   // Interpolate in the input and copy into the output
-  dev_vol_out[vol_idx] = tex3D(tex_input_vol, IndexInInput.x + 0.5f, IndexInInput.y + 0.5f, IndexInInput.z + 0.5f);
+  dev_vol_out[vol_idx] =
+    tex3D<float>(tex_input_vol, IndexInInput.x + 0.5f, IndexInInput.y + 0.5f, IndexInInput.z + 0.5f);
 }
 
 //_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
@@ -120,116 +113,15 @@ CUDA_warp(int     input_vol_dim[3],
           float * dev_DVF,
           bool    isLinear)
 {
-  // Prepare channel description for arrays
-  static cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-
-  // Create CUBLAS context
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-
-  ///////////////////////////////////
-  // For each component of the dvf, perform a strided copy (pick every third
-  // float from dev_input_dvf) into a 3D array, and bind the array to a 3D texture
-
-  // Extent stuff, will be used for each component extraction
-  cudaExtent dvfExtent = make_cudaExtent(input_dvf_dim[0], input_dvf_dim[1], input_dvf_dim[2]);
-
-  // Set texture parameters
-  tex_xdvf.addressMode[0] = cudaAddressModeBorder;
-  tex_xdvf.addressMode[1] = cudaAddressModeBorder;
-  tex_xdvf.addressMode[2] = cudaAddressModeBorder;
-  tex_xdvf.filterMode = cudaFilterModeLinear;
-  tex_xdvf.normalized = false; // don't access with normalized texture coords
-
-  tex_ydvf.addressMode[0] = cudaAddressModeBorder;
-  tex_ydvf.addressMode[1] = cudaAddressModeBorder;
-  tex_ydvf.addressMode[2] = cudaAddressModeBorder;
-  tex_ydvf.filterMode = cudaFilterModeLinear;
-  tex_ydvf.normalized = false;
-
-  tex_zdvf.addressMode[0] = cudaAddressModeBorder;
-  tex_zdvf.addressMode[1] = cudaAddressModeBorder;
-  tex_zdvf.addressMode[2] = cudaAddressModeBorder;
-  tex_zdvf.filterMode = cudaFilterModeLinear;
-  tex_zdvf.normalized = false;
-
-  // Allocate an intermediate memory space to extract x, y and z components of the DVF
-  float * DVFcomponent;
-  int     numel = input_dvf_dim[0] * input_dvf_dim[1] * input_dvf_dim[2];
-  cudaMalloc(&DVFcomponent, numel * sizeof(float));
-  float one = 1.0;
-
-  // Allocate the arrays used for textures
-  cudaArray ** DVFcomponentArrays = new cudaArray *[3];
-  CUDA_CHECK_ERROR;
-
-  // Copy image data to arrays. The tricky part is the make_cudaPitchedPtr.
-  // The best way to understand it is to read
-  // https://stackoverflow.com/questions/16119943/how-and-when-should-i-use-pitched-pointer-with-the-cuda-api
-  for (unsigned int component = 0; component < 3; component++)
-  {
-    // Reset the intermediate memory
-    cudaMemset((void *)DVFcomponent, 0, numel * sizeof(float));
-
-    // Fill it with the current component
-    float * pComponent = dev_DVF + component;
-    cublasSaxpy(handle, numel, &one, pComponent, 3, DVFcomponent, 1);
-
-    // Allocate the cudaArray and fill it with the current DVFcomponent
-    cudaMalloc3DArray((cudaArray **)&DVFcomponentArrays[component], &channelDesc, dvfExtent);
-    cudaMemcpy3DParms CopyParams = cudaMemcpy3DParms();
-    CopyParams.srcPtr =
-      make_cudaPitchedPtr(DVFcomponent, input_dvf_dim[0] * sizeof(float), input_dvf_dim[0], input_dvf_dim[1]);
-    CopyParams.dstArray = (cudaArray *)DVFcomponentArrays[component];
-    CopyParams.extent = dvfExtent;
-    CopyParams.kind = cudaMemcpyDeviceToDevice;
-    cudaMemcpy3D(&CopyParams);
-    CUDA_CHECK_ERROR;
-  }
-
-  // Intermediate memory is no longer needed
-  cudaFree(DVFcomponent);
-
-  // Bind 3D arrays to 3D textures
-  cudaBindTextureToArray(tex_xdvf, (cudaArray *)DVFcomponentArrays[0], channelDesc);
-  cudaBindTextureToArray(tex_ydvf, (cudaArray *)DVFcomponentArrays[1], channelDesc);
-  cudaBindTextureToArray(tex_zdvf, (cudaArray *)DVFcomponentArrays[2], channelDesc);
-  CUDA_CHECK_ERROR;
-
-  ///////////////////////////////////
-  // Do the same for the input volume
-
-  // Extent stuff
-  cudaExtent volExtent = make_cudaExtent(input_vol_dim[0], input_vol_dim[1], input_vol_dim[2]);
-
-  // Set texture parameters
-  tex_input_vol.addressMode[0] = cudaAddressModeBorder;
-  tex_input_vol.addressMode[1] = cudaAddressModeBorder;
-  tex_input_vol.addressMode[2] = cudaAddressModeBorder;
-  tex_input_vol.normalized = false; // don't access with normalized texture coords
-  if (isLinear)
-    tex_input_vol.filterMode = cudaFilterModeLinear;
-  else
-    tex_input_vol.filterMode = cudaFilterModePoint;
-
-  // Allocate the array
-  cudaArray * array_input_vol;
-  cudaMalloc3DArray((cudaArray **)&array_input_vol, &channelDesc, volExtent);
-  CUDA_CHECK_ERROR;
+  // Prepare DVF textures
+  std::vector<cudaArray *>         DVFComponentArrays;
+  std::vector<cudaTextureObject_t> tex_dvf;
+  prepareVectorTextureObject(input_dvf_dim, dev_DVF, DVFComponentArrays, 3, tex_dvf, false);
 
-  // Copy image data to array
-  cudaMemcpy3DParms inputCopyParams = cudaMemcpy3DParms();
-  inputCopyParams.srcPtr =
-    make_cudaPitchedPtr(dev_input_vol, input_vol_dim[0] * sizeof(float), input_vol_dim[0], input_vol_dim[1]);
-  inputCopyParams.dstArray = (cudaArray *)array_input_vol;
-  inputCopyParams.extent = volExtent;
-  inputCopyParams.kind = cudaMemcpyDeviceToDevice;
-  cudaMemcpy3D(&inputCopyParams);
-  CUDA_CHECK_ERROR;
-
-  // Bind 3D arrays to 3D textures
-  cudaBindTextureToArray(tex_input_vol, (cudaArray *)array_input_vol, channelDesc);
-  CUDA_CHECK_ERROR;
+  // Prepare volume texture
+  cudaArray *         array_input_vol;
+  cudaTextureObject_t tex_input_vol;
+  prepareScalarTextureObject(output_vol_dim, dev_input_vol, array_input_vol, tex_input_vol, false, isLinear);
 
   // Copy matrices into constant memory
   cudaMemcpyToSymbol(
@@ -238,12 +130,7 @@ CUDA_warp(int     input_vol_dim[3],
     c_IndexOutputToIndexDVFMatrix, IndexOutputToIndexDVFMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice);
   cudaMemcpyToSymbol(
     c_PPInputToIndexInputMatrix, PPInputToIndexInputMatrix, 12 * sizeof(float), 0, cudaMemcpyHostToDevice);
-
-  //////////////////////////////////////
-  /// Run
-
-  int device;
-  cudaGetDevice(&device);
+  CUDA_CHECK_ERROR;
 
   // Thread Block Dimensions
   constexpr int tBlock_x = 16;
@@ -258,29 +145,24 @@ CUDA_warp(int     input_vol_dim[3],
   dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
   dim3 dimBlock = dim3(tBlock_x, tBlock_y, tBlock_z);
 
-  // Note: the DVF and input image are passed via texture memory
-  //-------------------------------------
   kernel_3Dgrid<<<dimGrid, dimBlock>>>(dev_output_vol,
-                                       make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]));
-
-  CUDA_CHECK_ERROR;
-
-  // Unbind the image and projection matrix textures
-  cudaUnbindTexture(tex_xdvf);
-  cudaUnbindTexture(tex_ydvf);
-  cudaUnbindTexture(tex_zdvf);
-  cudaUnbindTexture(tex_input_vol);
+                                       make_int3(output_vol_dim[0], output_vol_dim[1], output_vol_dim[2]),
+                                       tex_dvf[0],
+                                       tex_dvf[1],
+                                       tex_dvf[2],
+                                       tex_input_vol);
   CUDA_CHECK_ERROR;
 
   // Cleanup
+  for (unsigned int c = 0; c < 3; c++)
+  {
+    cudaFreeArray(DVFComponentArrays[c]);
+    CUDA_CHECK_ERROR;
+    cudaDestroyTextureObject(tex_dvf[c]);
+    CUDA_CHECK_ERROR;
+  }
+  cudaFreeArray(array_input_vol);
   CUDA_CHECK_ERROR;
-  cudaFreeArray((cudaArray *)DVFcomponentArrays[0]);
-  cudaFreeArray((cudaArray *)DVFcomponentArrays[1]);
-  cudaFreeArray((cudaArray *)DVFcomponentArrays[2]);
-  cudaFreeArray((cudaArray *)array_input_vol);
-  delete[] DVFcomponentArrays;
+  cudaDestroyTextureObject(tex_input_vol);
   CUDA_CHECK_ERROR;
-
-  // Destroy CUBLAS context
-  cublasDestroy(handle);
 }
diff --git a/src/rtkCudaWeidingerForwardModelImageFilter.cu b/src/rtkCudaWeidingerForwardModelImageFilter.cu
index cf3522a63..e03c8dc8e 100644
--- a/src/rtkCudaWeidingerForwardModelImageFilter.cu
+++ b/src/rtkCudaWeidingerForwardModelImageFilter.cu
@@ -38,7 +38,6 @@
  * CUDA #includes *
  *****************/
 #include <cuda.h>
-#include <cublas_v2.h>
 #include <cuda_runtime.h>
 
 #define IDX2D(r, c, cols) ((r) * (cols) + (c))