Improved iq raw 2 LRI performance.

Moved the probe geometry to constant memory, changed kernel grid of threads dimensions.
us4useu · Nov 12, 2021 · 906f931 · 906f931
1 parent 02d71df
commit 906f931
Show file tree

Hide file tree

Showing 2 changed files with 122 additions and 99 deletions.
diff --git a/api/python/arrus/utils/imaging.py b/api/python/arrus/utils/imaging.py
@@ -1435,18 +1435,21 @@ def __init__(self, x_grid, z_grid, rx_tang_limits=None):
 
     def set_pkgs(self, num_pkg, **kwargs):
         if num_pkg is np:
-            raise ValueError("Currently reconstructLri operation is "
-                             "implemented for GPU only.")
+            raise ValueError("ReconstructLri operation is implemented for GPU only.")
 
     def prepare(self, const_metadata):
         from pathlib import Path
         import os
+        import cupy as cp
+
         current_dir = os.path.dirname(os.path.join(os.path.abspath(__file__)))
         _kernel_source = Path(os.path.join(current_dir, "iq_raw_2_lri.cu")).read_text()
-        self._kernel = self.num_pkg.RawKernel(_kernel_source, "iqRaw2Lri")
+        self._kernel_module = self.num_pkg.RawModule(code=_kernel_source)
+        self._kernel = self._kernel_module.get_function("iqRaw2Lri")
+        self._z_elem_const = self._kernel_module.get_global("zElemConst")
+        self._tang_elem_const = self._kernel_module.get_global("tangElemConst")
 
         # INPUT PARAMETERS.
-
         # Input data shape.
         self.n_tx, self.n_rx, self.n_samples = const_metadata.input_shape
 
@@ -1461,10 +1464,13 @@ def prepare(self, const_metadata):
         self.output_buffer = self.num_pkg.zeros(output_shape, dtype=self.num_pkg.complex64)
         x_block_size = min(self.x_size, 16)
         z_block_size = min(self.z_size, 16)
-        self.block_size = (z_block_size, x_block_size, 1)
+        tx_block_size = min(self.n_tx, 4)
+        self.block_size = (z_block_size, x_block_size, tx_block_size)
         self.grid_size = (int((self.z_size-1)//z_block_size + 1),
                           int((self.x_size-1)//x_block_size + 1),
-                          self.n_tx)
+                          int((self.n_tx-1)//tx_block_size + 1))
+        print(self.block_size)
+        print(self.grid_size)
         self.x_pix = self.num_pkg.asarray(self.x_grid, dtype=self.num_pkg.float32)
         self.z_pix = self.num_pkg.asarray(self.z_grid, dtype=self.num_pkg.float32)
 
@@ -1478,11 +1484,23 @@ def prepare(self, const_metadata):
         element_pos_x = probe_model.element_pos_x
         element_pos_z = probe_model.element_pos_z
         element_angle_tang = np.tan(probe_model.element_angle)
-        self.x_elem = self.num_pkg.asarray(element_pos_x, dtype=self.num_pkg.float32)
-        self.z_elem = self.num_pkg.asarray(element_pos_z, dtype=self.num_pkg.float32)
-        self.tang_elem = self.num_pkg.asarray(element_angle_tang, dtype=self.num_pkg.float32)
+
         self.n_elements = probe_model.n_elements
 
+        device_props = cp.cuda.runtime.getDeviceProperties(0)
+        if device_props["totalConstMem"] < 256*3*4: # 3 float32 arrays, 256 elements max
+            raise ValueError("There is not enough constant memory available!")
+
+        x_elem = np.asarray(element_pos_x, dtype=self.num_pkg.float32)
+        self._x_elem_const = _get_const_memory_array(
+            self._kernel_module, name="xElemConst", input_array=x_elem)
+        z_elem = np.asarray(element_pos_z, dtype=self.num_pkg.float32)
+        self._z_elem_const = _get_const_memory_array(
+            self._kernel_module, name="zElemConst", input_array=z_elem)
+        tang_elem = np.asarray(element_angle_tang, dtype=self.num_pkg.float32)
+        self._tang_elem_const = _get_const_memory_array(
+            self._kernel_module, name="tangElemConst", input_array=tang_elem)
+
         # TX aperture description
         # Convert the sequence to the positions of the aperture centers
         tx_rx_params = arrus.kernels.imaging.compute_tx_rx_params(
@@ -1530,8 +1548,7 @@ def process(self, data):
         params = (
             self.output_buffer,
             data,
-            self.x_elem, self.z_elem, self.tang_elem, self.n_elements, # DONE
-            self.n_tx, self.n_samples,
+            self.n_elements, self.n_tx, self.n_samples,
             self.z_pix, self.z_size,
             self.x_pix, self.x_size,
             self.sos, self.fs, self.fn,
@@ -1757,3 +1774,11 @@ def process(self, data):
         self._remap_fn(data)
         return self._output_buffer
 
+
+def _get_const_memory_array(module, name, input_array):
+    import cupy as cp
+    const_arr_ptr = module.get_global(name)
+    const_arr = cp.ndarray(shape=input_array.shape, dtype=input_array.dtype,
+                           memptr=const_arr_ptr)
+    const_arr.set(input_array)
+    return const_arr
diff --git a/api/python/arrus/utils/iq_raw_2_lri.cu b/api/python/arrus/utils/iq_raw_2_lri.cu
@@ -2,11 +2,14 @@
 
 #define CUDART_PI_F 3.141592654f
 
+__constant__ float zElemConst[256];
+__constant__ float xElemConst[256];
+__constant__ float tangElemConst[256];
+
 extern "C"
 __global__ void
 iqRaw2Lri(complex<float> *iqLri, const complex<float> *iqRaw,
-          const float *xElem, const float *zElem, const float *tangElem, const int nElem,
-          const int nTx, const int nSamp,
+          const int nElem, const int nTx, const int nSamp,
           const float *zPix, const int nZPix,
           const float *xPix, const int nXPix,
           float const sos, float const fs, float const fn,
@@ -19,8 +22,9 @@ iqRaw2Lri(complex<float> *iqLri, const complex<float> *iqRaw,
 
     int z = blockIdx.x * blockDim.x + threadIdx.x;
     int x = blockIdx.y * blockDim.y + threadIdx.y;
+    int iTx = blockIdx.z * blockDim.z + threadIdx.z;
 
-    if (z >= nZPix || x >= nXPix) {
+    if(z >= nZPix || x >= nXPix || iTx >= nTx) {
         return;
     }
 
@@ -36,93 +40,87 @@ iqRaw2Lri(complex<float> *iqLri, const complex<float> *iqRaw,
     const float centRxTang = (maxRxTang + minRxTang) * 0.5f;
     complex<float> pix(0.0f, 0.0f), samp(0.0f, 0.0f), modFactor;
 
-    for (int iTx = 0; iTx < nTx; ++iTx) {
-        int txOffset = iTx*nSamp*nRx;
-
-        if (!isinf(txFoc[iTx])) {
-            /* STA */
-            float zFoc = txApCentZ[iTx] + txFoc[iTx]*cosf(txAngZX[iTx]);
-            float xFoc = txApCentX[iTx] + txFoc[iTx]*sinf(txAngZX[iTx]);
-
-            float pixFocArrang;
-
-            if (txFoc[iTx] <= 0.0f) {
-                /* Virtual Point Source BEHIND probe surface */
-                // Valid pixels are assumed to be always in front of the focal point (VSP)
-                pixFocArrang = 1.0f;
-            }
-            else {
-                /* Virtual Point Source IN FRONT OF probe surface */
-                // Projection of the Foc-Pix vector on the ApCent-Foc vector (dot product) ...
-                // to determine if the pixel is behind (-) or in front of (+) the focal point (VSP).
-                pixFocArrang = (((zPix[z]-zFoc)*(zFoc-txApCentZ[iTx]) +
-                                 (xPix[x]-xFoc)*(xFoc-txApCentX[iTx])) >= 0.f) ? 1.f : -1.f;
-            }
-            txDist	= hypotf(zPix[z] - zFoc, xPix[x] - xFoc);
-            txDist *= pixFocArrang; // Compensation for the Pix-Foc arrangement
-            txDist += txFoc[iTx]; // Compensation for the reference time being the moment when txApCent fires.
-
-            // Projections of Foc-Pix vector on the rotated Foc-ApEdge vectors (dot products) ...
-            // to determine if the pixel is in the sonified area (dot product >= 0).
-            // Foc-ApEdgeFst vector is rotated left, Foc-ApEdgeLst vector is rotated right.
-            txApod = ( ( (-(xElem[txApFstElem[iTx]] - xFoc)*(zPix[z] - zFoc) +
-                          (zElem[txApFstElem[iTx]] - zFoc)*(xPix[x] - xFoc))*pixFocArrang >= 0.f ) &&
-                       ( ( (xElem[txApLstElem[iTx]] - xFoc)*(zPix[z] - zFoc) -
-                           (zElem[txApLstElem[iTx]] - zFoc)*(xPix[x] - xFoc))*pixFocArrang >= 0.f ) ) ? 1.f : 0.f;
-        }
-        else {
-            /* PWI */
-            txDist = (zPix[z] - txApCentZ[iTx]) * cosf(txAngZX[iTx]) +
-                     (xPix[x] - txApCentX[iTx]) * sinf(txAngZX[iTx]);
-
-            // Projections of ApEdge-Pix vector on the rotated unit vector of tx direction (dot products) ...
-            // to determine if the pixel is in the sonified area (dot product >= 0).
-            // For ApEdgeFst, the vector is rotated left, for ApEdgeLst the vector is rotated right.
-            txApod = (((-(zPix[z] - zElem[txApFstElem[iTx]])*sinf(txAngZX[iTx]) +
-                        (xPix[x] - xElem[txApFstElem[iTx]])*cosf(txAngZX[iTx])) >= 0.f) &&
-                      (((zPix[z] - zElem[txApLstElem[iTx]])*sinf(txAngZX[iTx]) -
-                        (xPix[x] - xElem[txApLstElem[iTx]])*cosf(txAngZX[iTx])) >= 0.f)) ? 1.f : 0.f;
+    int txOffset = iTx * nSamp * nRx;
+
+    if(!isinf(txFoc[iTx])) {
+        /* STA */
+        float zFoc = txApCentZ[iTx] + txFoc[iTx] * cosf(txAngZX[iTx]);
+        float xFoc = txApCentX[iTx] + txFoc[iTx] * sinf(txAngZX[iTx]);
+
+        float pixFocArrang;
+
+        if(txFoc[iTx] <= 0.0f) {
+            /* Virtual Point Source BEHIND probe surface */
+            // Valid pixels are assumed to be always in front of the focal point (VSP)
+            pixFocArrang = 1.0f;
+        } else {
+            /* Virtual Point Source IN FRONT OF probe surface */
+            // Projection of the Foc-Pix vector on the ApCent-Foc vector (dot product) ...
+            // to determine if the pixel is behind (-) or in front of (+) the focal point (VSP).
+            pixFocArrang = (((zPix[z] - zFoc) * (zFoc - txApCentZ[iTx]) +
+                             (xPix[x] - xFoc) * (xFoc - txApCentX[iTx])) >= 0.f) ? 1.f : -1.f;
         }
-        pixWgh = 0.0f;
-        pix.real(0.0f);
-        pix.imag(0.0f);
-
-        if (txApod != 0.0f) {
-            for (int iRx = 0; iRx < nRx; iRx++) {
-                iElem = iRx + rxApOrigElem[iTx];
-                if (iElem < 0 || iElem >= nElem) continue;
-
-                rxDist = hypotf(xPix[x] - xElem[iElem], zPix[z] - zElem[iElem]);
-                rxTang = __fdividef(xPix[x] - xElem[iElem], zPix[z] - zElem[iElem]);
-                rxTang = __fdividef(rxTang - tangElem[iElem], 1.f + rxTang*tangElem[iElem]);
-                if (rxTang < minRxTang || rxTang > maxRxTang) continue;
-
-                rxApod = (rxTang - centRxTang) * rngRxTangInv;
-                rxApod = __expf(-rxApod * rxApod * twoSigSqrInv);
-
-                time = (txDist + rxDist)*sosInv + initDel;
-                iSamp = time * fs;
-                if (iSamp < 0.0f || iSamp >= static_cast<float>(nSamp - 1)) {
-                    continue;
-                }
-                offset = txOffset + iRx*nSamp;
-                interpWgh = modff(iSamp, &iSamp);
-                int intSamp = int(iSamp);
-
-                __sincosf(omega*time, &modSin, &modCos);
-                complex<float> modFactor = complex<float>(modCos, modSin);
-
-                samp = iqRaw[offset+intSamp]*(1-interpWgh) + iqRaw[offset+intSamp+1]*interpWgh;
-                pix += samp*modFactor*rxApod;
-                pixWgh += rxApod;
+        txDist = hypotf(zPix[z] - zFoc, xPix[x] - xFoc);
+        txDist *= pixFocArrang; // Compensation for the Pix-Foc arrangement
+        txDist += txFoc[iTx]; // Compensation for the reference time being the moment when txApCent fires.
+
+        // Projections of Foc-Pix vector on the rotated Foc-ApEdge vectors (dot products) ...
+        // to determine if the pixel is in the sonified area (dot product >= 0).
+        // Foc-ApEdgeFst vector is rotated left, Foc-ApEdgeLst vector is rotated right.
+        txApod = (((-(xElemConst[txApFstElem[iTx]] - xFoc) * (zPix[z] - zFoc) +
+                    (zElemConst[txApFstElem[iTx]] - zFoc) * (xPix[x] - xFoc)) * pixFocArrang >= 0.f) &&
+                  (((xElemConst[txApLstElem[iTx]] - xFoc) * (zPix[z] - zFoc) -
+                    (zElemConst[txApLstElem[iTx]] - zFoc) * (xPix[x] - xFoc)) * pixFocArrang >= 0.f)) ? 1.f : 0.f;
+    } else {
+        /* PWI */
+        txDist = (zPix[z] - txApCentZ[iTx]) * cosf(txAngZX[iTx]) +
+                 (xPix[x] - txApCentX[iTx]) * sinf(txAngZX[iTx]);
+
+        // Projections of ApEdge-Pix vector on the rotated unit vector of tx direction (dot products) ...
+        // to determine if the pixel is in the sonified area (dot product >= 0).
+        // For ApEdgeFst, the vector is rotated left, for ApEdgeLst the vector is rotated right.
+        txApod = (((-(zPix[z] - zElemConst[txApFstElem[iTx]]) * sinf(txAngZX[iTx]) +
+                    (xPix[x] - xElemConst[txApFstElem[iTx]]) * cosf(txAngZX[iTx])) >= 0.f) &&
+                  (((zPix[z] - zElemConst[txApLstElem[iTx]]) * sinf(txAngZX[iTx]) -
+                    (xPix[x] - xElemConst[txApLstElem[iTx]]) * cosf(txAngZX[iTx])) >= 0.f)) ? 1.f : 0.f;
+    }
+    pixWgh = 0.0f;
+    pix.real(0.0f);
+    pix.imag(0.0f);
+
+    if(txApod != 0.0f) {
+        for(int iRx = 0; iRx < nRx; iRx++) {
+            iElem = iRx + rxApOrigElem[iTx];
+            if(iElem < 0 || iElem >= nElem) continue;
+
+            rxDist = hypotf(xPix[x] - xElemConst[iElem], zPix[z] - zElemConst[iElem]);
+            rxTang = __fdividef(xPix[x] - xElemConst[iElem], zPix[z] - zElemConst[iElem]);
+            rxTang = __fdividef(rxTang - tangElemConst[iElem], 1.f + rxTang * tangElemConst[iElem]);
+            if(rxTang < minRxTang || rxTang > maxRxTang) continue;
+
+            rxApod = (rxTang - centRxTang) * rngRxTangInv;
+            rxApod = __expf(-rxApod * rxApod * twoSigSqrInv);
+
+            time = (txDist + rxDist) * sosInv + initDel;
+            iSamp = time * fs;
+            if(iSamp < 0.0f || iSamp >= static_cast<float>(nSamp - 1)) {
+                continue;
             }
-        }
-        if(pixWgh == 0.0f) {
-            iqLri[z + x*nZPix + iTx*nZPix*nXPix] = complex<float>(0.0f, 0.0f);
-        }
-        else {
-            iqLri[z + x * nZPix + iTx * nZPix * nXPix] = pix/pixWgh*txApod;
+            offset = txOffset + iRx * nSamp;
+            interpWgh = modff(iSamp, &iSamp);
+            int intSamp = int(iSamp);
+
+            __sincosf(omega * time, &modSin, &modCos);
+            complex<float> modFactor = complex<float>(modCos, modSin);
+
+            samp = iqRaw[offset + intSamp] * (1 - interpWgh) + iqRaw[offset + intSamp + 1] * interpWgh;
+            pix += samp * modFactor * rxApod;
+            pixWgh += rxApod;
         }
     }
-
+    if(pixWgh == 0.0f) {
+        iqLri[z + x * nZPix + iTx * nZPix * nXPix] = complex<float>(0.0f, 0.0f);
+    } else {
+        iqLri[z + x * nZPix + iTx * nZPix * nXPix] = pix / pixWgh * txApod;
+    }
 }