From 440c08a43531430246adccaf75b6b1fe2c728d8f Mon Sep 17 00:00:00 2001
From: Piotr Jarosik <piotr.jarosik@us4us.eu>
Date: Sun, 3 Apr 2022 15:04:38 +0200
Subject: [PATCH] Fixed TX apodization for CUDA kernel for 3D.

---
 api/python/arrus/utils/imaging.py         | 107 +++++++++++++---------
 api/python/arrus/utils/iq_raw_2_lri_3d.cu |  31 ++++---
 2 files changed, 83 insertions(+), 55 deletions(-)

diff --git a/api/python/arrus/utils/imaging.py b/api/python/arrus/utils/imaging.py
index 8482099f4..bb872dada 100644
--- a/api/python/arrus/utils/imaging.py
+++ b/api/python/arrus/utils/imaging.py
@@ -2175,10 +2175,7 @@ class ReconstructLri3D(Operation):
     """
 
     def __init__(self, x_grid, y_grid, z_grid, tx_foc, tx_ang_zx, tx_ang_zy,
-                 tx_ap_cent_x, tx_ap_cent_y,
                  speed_of_sound, rx_tang_limits=None):
-        self.tx_ap_cent_y = tx_ap_cent_y
-        self.tx_ap_cent_x = tx_ap_cent_x
         self.tx_ang_zy = tx_ang_zy
         self.tx_ang_zx = tx_ang_zx
         self.tx_foc = tx_foc
@@ -2194,6 +2191,17 @@ def set_pkgs(self, num_pkg, **kwargs):
         if num_pkg is np:
             raise ValueError("ReconstructLri3D operation is implemented for GPU only.")
 
+    def _get_aperture_boundaries(self, apertures):
+        def get_min_max_x_y(ap):
+            cords = np.argwhere(ap)
+            y, x = zip(*cords)
+            return np.min(x), np.max(x), np.min(y), np.max(y)
+        min_max_x_y = (get_min_max_x_y(aperture) for aperture in apertures)
+        min_x, max_x, min_y, max_y = zip(*min_max_x_y)
+        min_x, max_x = np.atleast_1d(min_x), np.atleast_1d(max_x)
+        min_y, max_y = np.atleast_1d(min_y), np.atleast_1d(max_y)
+        return min_x, max_x, min_y, max_y
+
     def prepare(self, const_metadata):
         import cupy as cp
 
@@ -2220,7 +2228,7 @@ def prepare(self, const_metadata):
         self.y_size = len(self.y_grid)
         self.x_size = len(self.x_grid)
         self.z_size = len(self.z_grid)
-        output_shape = (self.n_seq, self.n_tx, self.y_size, self.x_size, self.z_size)
+        output_shape = (self.n_seq, self.y_size, self.x_size, self.z_size)
         self.output_buffer = self.num_pkg.zeros(output_shape, dtype=self.num_pkg.complex64)
         x_block_size = min(self.z_size, 8)
         y_block_size = min(self.x_size, 8)
@@ -2242,26 +2250,20 @@ def prepare(self, const_metadata):
         self.tx_ang_zx = self.num_pkg.asarray(self.tx_ang_zx).astype(np.float32)
         self.tx_ang_zy = self.num_pkg.asarray(self.tx_ang_zy).astype(np.float32)
 
-        # Could be determined automatically based on
-        self.tx_ap_cent_x = self.num_pkg.asarray(self.tx_ap_cent_x).astype(np.float32)
-        self.tx_ap_cent_y = self.num_pkg.asarray(self.tx_ap_cent_y).astype(np.float32)
-
         # Probe description
         # TODO specific for Vermon mat-3d probe.
         probe_model = const_metadata.context.device.probe.model
         pitch = 0.3e-3 # probe_model.pitch
         self.n_elements = 32
+        n_rows_x = self.n_elements
+        n_rows_y = self.n_elements + 3
         # General regular position of elements
-        element_pos = np.arange(-(self.n_elements - 1)/2, self.n_elements/2)
-        element_pos_x = element_pos*pitch
-        element_pos_y = np.zeros_like(element_pos)
-        # E.g. rows
-        # 0:8 -> 0:8
-        # 8:16 -> 9:17 (+ 1 offset)
-        # 16:24 -> 18:26 (+2 offset)
-        # 24:32 -> 27:35 (+ 3 offset)
-        for i in range(4):
-            element_pos_y[i*8:(i+1)*8] = (element_pos[i*8:(i+1)*8]+i)*pitch
+        element_pos_x = np.linspace(-(n_rows_x - 1)/2, (n_rows_x - 1)/2, num=n_rows_x)
+        element_pos_x = element_pos_x*pitch
+
+        element_pos_y = np.linspace(-(n_rows_y - 1)/2, (n_rows_y - 1)/2, num=n_rows_y)
+        element_pos_y = element_pos_y*pitch
+        element_pos_y = np.delete(element_pos_y, (8, 17, 26))
 
         element_pos_x = element_pos_x.astype(np.float32)
         element_pos_y = element_pos_y.astype(np.float32)
@@ -2282,24 +2284,44 @@ def get_min_max_x_y(aperture):
             return np.min(x), np.max(x), np.min(y), np.max(y)
 
         # TODO assumption, that probe has the same number elements in both dimensions
-        apertures = (tx_rx.tx.aperture.reshape((self.n_elements, self.n_elements)) for tx_rx in seq.ops)
-        min_max_x_y = (get_min_max_x_y(aperture) for aperture in apertures)
-        min_x, max_x, min_y, max_y = zip(*min_max_x_y)
-        min_x, max_x = np.atleast_1d(min_x), np.atleast_1d(max_x)
-        min_y, max_y = np.atleast_1d(min_y), np.atleast_1d(max_y)
-        self.tx_ap_first_elem_x = self.num_pkg.asarray(min_x, dtype=self.num_pkg.int32)
-        self.tx_ap_last_elem_x = self.num_pkg.asarray(max_x, dtype=self.num_pkg.int32)
-        self.tx_ap_first_elem_y = self.num_pkg.asarray(min_y, dtype=self.num_pkg.int32)
-        self.tx_ap_last_elem_y = self.num_pkg.asarray(max_y, dtype=self.num_pkg.int32)
-
-        # Find the tx_center_delay
+        tx_apertures = (tx_rx.tx.aperture.reshape((self.n_elements, self.n_elements)) for tx_rx in seq.ops)
+        rx_apertures = (tx_rx.rx.aperture.reshape((self.n_elements, self.n_elements)) for tx_rx in seq.ops)
+        tx_bounds = self._get_aperture_boundaries(tx_apertures)
+        rx_bounds = self._get_aperture_boundaries(rx_apertures)
+        txap_min_x, txap_max_x, txap_min_y, txap_max_y = tx_bounds
+        rxap_min_x, rxap_max_x, rxap_min_y, rxap_max_y = rx_bounds
+        rxap_size_x = set((rxap_max_x-rxap_min_x).tolist())
+        rxap_size_y = set((rxap_max_y-rxap_min_y).tolist())
+        if len(rxap_size_x) > 1 or len(rxap_size_y) > 1:
+            raise ValueError("Each TX/RX aperture should have the same square aperture size.")
+        rxap_size_x = next(iter(rxap_size_x))
+        rxap_size_y = next(iter(rxap_size_y))
+        # The above can be also compared with the size of data, but right we are not doing it here
+
+        self.tx_ap_first_elem_x = self.num_pkg.asarray(txap_min_x, dtype=self.num_pkg.int32)
+        self.tx_ap_last_elem_x = self.num_pkg.asarray(txap_max_x, dtype=self.num_pkg.int32)
+        self.tx_ap_first_elem_y = self.num_pkg.asarray(txap_min_y, dtype=self.num_pkg.int32)
+        self.tx_ap_last_elem_y = self.num_pkg.asarray(txap_max_y, dtype=self.num_pkg.int32)
+        # RX AP
+        self.rx_ap_first_elem_x = self.num_pkg.asarray(rxap_min_x, dtype=self.num_pkg.int32)
+        self.rx_ap_first_elem_y = self.num_pkg.asarray(rxap_min_y, dtype=self.num_pkg.int32)
+
+        # Find the center of TX aperture.
+        # TODO note: this method assumes that all TX/RXs have a rectangle TX aperture
         # 1. Find the position of the center.
-        ap_center_x = (element_pos_x[min_x] + element_pos_x[max_x])/2
-        ap_center_y = (element_pos_y[min_y] + element_pos_y[max_y])/2
-        ap_center_elem_x = np.interp(ap_center_x, element_pos_x, np.arange(len(element_pos_x)))
-        ap_center_elem_y = np.interp(ap_center_y, element_pos_y, np.arange(len(element_pos_y)))
-        ap_center_elem_x = np.round(ap_center_elem_x).astype(np.int32)
-        ap_center_elem_y = np.round(ap_center_elem_y).astype(np.int32)
+        tx_ap_center_x = (element_pos_x[txap_min_x] + element_pos_x[txap_max_x])/2
+        tx_ap_center_y = (element_pos_y[txap_min_y] + element_pos_y[txap_max_y])/2
+        # element index -> element position
+        ap_center_elem_x = np.interp(tx_ap_center_x, element_pos_x, np.arange(len(element_pos_x)))
+        ap_center_elem_y = np.interp(tx_ap_center_y, element_pos_y, np.arange(len(element_pos_y)))
+        # TODO Currently 'floor' NN, consider interpolating into
+        #  the center delay
+        ap_center_elem_x = np.floor(ap_center_elem_x).astype(np.int32)
+        ap_center_elem_y = np.floor(ap_center_elem_y).astype(np.int32)
+        self.tx_ap_cent_x = self.num_pkg.asarray(tx_ap_center_x).astype(np.float32)
+        self.tx_ap_cent_y = self.num_pkg.asarray(tx_ap_center_y).astype(np.float32)
+
+        # FIND THE TX_CENTER_DELAY
         # Make sure, that for all TX/RXs:
         # The center element is in the aperture
         # All TX/RX have (almost) the same delay in the aperture's center
@@ -2309,15 +2331,16 @@ def get_min_max_x_y(aperture):
             tx_center_x = ap_center_elem_x[i]
             tx_center_y = ap_center_elem_y[i]
             aperture = tx.aperture.reshape((self.n_elements, self.n_elements))
-            # TODO the below will work only for full aperture transmits
-            delays = tx.delays.reshape((self.n_elements, self.n_elements))
+            delays = np.zeros(aperture.shape)
+            delays[:] = np.nan
+            delays[np.where(aperture)] = tx.delays.flatten()
             if not aperture[tx_center_y, tx_center_x]:
                 # The aperture's center should transmit signal
                 raise ValueError("TX aperture center should be turned on.")
             if tx_center_delay is None:
                 tx_center_delay = delays[tx_center_y, tx_center_x]
             else:
-                # Make sure that the center' delays is the same for all TX/RXs
+                # Make sure that the center' delays is the same position for all TX/RXs
                 current_center_delay = delays[tx_center_y, tx_center_x]
                 if not np.isclose(tx_center_delay, current_center_delay):
                     raise ValueError(f"TX/RX {i}: center delay is not close "
@@ -2327,7 +2350,7 @@ def get_min_max_x_y(aperture):
                                      f"Center delays should be equalized "
                                      f"for all TX/RXs. ")
 
-        # Min/max tang
+        # MIN/MAX TANG
         if self.rx_tang_limits is not None:
             self.min_tang, self.max_tang = self.rx_tang_limits
         else:
@@ -2338,7 +2361,6 @@ def get_min_max_x_y(aperture):
         burst_factor = ref_tx.excitation.n_periods / (2*self.fn)
         self.initial_delay = -start_sample/65e6+burst_factor+tx_center_delay
         self.initial_delay = self.num_pkg.float32(self.initial_delay)
-
         self.rx_apod = scipy.signal.windows.hamming(20).astype(np.float32)
         self.rx_apod = self.num_pkg.asarray(self.rx_apod)
         self.n_rx_apod = self.num_pkg.int32(len(self.rx_apod))
@@ -2361,7 +2383,8 @@ def process(self, data):
             self.min_tang, self.max_tang,
             self.min_tang, self.max_tang,
             self.initial_delay,
-            self.rx_apod, self.n_rx_apod
+            self.rx_apod, self.n_rx_apod,
+            self.rx_ap_first_elem_x, self.rx_ap_first_elem_y
         )
         self._kernel(self.grid_size, self.block_size, params)
-        return self.output_buffer
\ No newline at end of file
+        return self.output_buffer
diff --git a/api/python/arrus/utils/iq_raw_2_lri_3d.cu b/api/python/arrus/utils/iq_raw_2_lri_3d.cu
index 742c7a699..4341275a7 100644
--- a/api/python/arrus/utils/iq_raw_2_lri_3d.cu
+++ b/api/python/arrus/utils/iq_raw_2_lri_3d.cu
@@ -42,8 +42,11 @@ __global__ void iqRaw2Lri3D(complex<float> *iqLri, const complex<float> *input,
                             const float minRxTangZX, const float maxRxTangZX,
                             const float minRxTangZY, const float maxRxTangZY,
                             const float initDel,
-                            const float *rxApod, const int nRxApod) {
-
+                            const float *rxApod, const int nRxApod,
+    // list of positions (x, y) of the first element of rx aperture
+    // (assuming rectangle aperture, the first element is the one in the top left corner)
+                            const int *rxApFstElemX, const int *rxApFstElemY
+) {
     int z = blockIdx.x * blockDim.x + threadIdx.x;
     int x = blockIdx.y * blockDim.y + threadIdx.y;
     int y = blockIdx.z * blockDim.z + threadIdx.z;
@@ -97,11 +100,11 @@ __global__ void iqRaw2Lri3D(complex<float> *iqLri, const complex<float> *input,
             // to determine if the pixel is in the sonified area (dot product >= 0).
             // Foc-ApEdgeFst vector is rotated left, Foc-ApEdgeLst vector is rotated right.
             txApod = (
-                    ((-(zPix[z] - zFoc)*(xElemConst[txApFstElemX[iTx]] - xFoc) - (xPix[x] - xFoc)*zFoc) * pixFocArrang >= 0.f) &&
-                    ( ((zPix[z] - zFoc)*(xElemConst[txApLstElemX[iTx]] - xFoc) + (xPix[x] - xFoc)*zFoc) * pixFocArrang >= 0.f) &&
-                    ((-(zPix[z] - zFoc)*(yElemConst[txApFstElemY[iTx]] - yFoc) - (yPix[y] - yFoc)*zFoc) * pixFocArrang >= 0.f) &&
-                    (( (zPix[z] - zFoc)*(yElemConst[txApLstElemY[iTx]] - yFoc) + (yPix[y] - yFoc)*zFoc) * pixFocArrang >= 0.f)
-                    ) ? 1.f : 0.f;
+                         ((-(zPix[z] - zFoc)*(xElemConst[txApFstElemX[iTx]] - xFoc) - (xPix[x] - xFoc)*zFoc) * pixFocArrang >= 0.f) &&
+                             ( ((zPix[z] - zFoc)*(xElemConst[txApLstElemX[iTx]] - xFoc) + (xPix[x] - xFoc)*zFoc) * pixFocArrang >= 0.f) &&
+                             ((-(zPix[z] - zFoc)*(yElemConst[txApFstElemY[iTx]] - yFoc) - (yPix[y] - yFoc)*zFoc) * pixFocArrang >= 0.f) &&
+                             (( (zPix[z] - zFoc)*(yElemConst[txApLstElemY[iTx]] - yFoc) + (yPix[y] - yFoc)*zFoc) * pixFocArrang >= 0.f)
+                     ) ? 1.f : 0.f;
         }
         else {
             /* PWI */
@@ -111,9 +114,9 @@ __global__ void iqRaw2Lri3D(complex<float> *iqLri, const complex<float> *input,
             // to determine if the pixel is in the sonified area (dot product >= 0).
             // For ApEdgeFst, the vector is rotated left, for ApEdgeLst the vector is rotated right.
             txApod = (((-sinf(txAngZX[iTx])*zPix[z] + cosf(txAngZX[iTx]) * (xPix[x]-xElemConst[txApFstElemX[iTx]])) >= 0.f) &&
-                    (   (sinf(txAngZX[iTx])*zPix[z] - cosf(txAngZX[iTx]) * (xPix[x]-xElemConst[txApLstElemX[iTx]])) >= 0.f) &&
-                    (  (-sinf(txAngZY[iTx])*zPix[z] + cosf(txAngZY[iTx]) * (yPix[x]-yElemConst[txApFstElemY[iTx]])) >= 0.f) &&
-                    (   (sinf(txAngZY[iTx])*zPix[z] - cosf(txAngZY[iTx]) * (yPix[x]-yElemConst[txApLstElemY[iTx]])) >= 0.f )) ? 1.f : 0.f;
+                (   (sinf(txAngZX[iTx])*zPix[z] - cosf(txAngZX[iTx]) * (xPix[x]-xElemConst[txApLstElemX[iTx]])) >= 0.f) &&
+                (  (-sinf(txAngZY[iTx])*zPix[z] + cosf(txAngZY[iTx]) * (yPix[y]-yElemConst[txApFstElemY[iTx]])) >= 0.f) &&
+                (   (sinf(txAngZY[iTx])*zPix[z] - cosf(txAngZY[iTx]) * (yPix[y]-yElemConst[txApLstElemY[iTx]])) >= 0.f )) ? 1.f : 0.f;
         }
 
         pix.real(0.0f);
@@ -122,15 +125,17 @@ __global__ void iqRaw2Lri3D(complex<float> *iqLri, const complex<float> *input,
 
         if (txApod != 0.f) {
             for (int iElemY = 0; iElemY < nElemY; ++iElemY) {
-                rxTang = (yPix[y]-yElemConst[iElemY])*zDistInv;
+                int rxFstElemY = iElemY+rxApFstElemY[iTx]; // global, the position in the full probe's aperture
+                rxTang = (yPix[y]-yElemConst[rxFstElemY])*zDistInv;
                 if (rxTang < minRxTangZY || rxTang > maxRxTangZY) continue;
                 rxApodY = interpLinearNormalized(rxApod, nRxApod, (rxTang-minRxTangZY)*rngRxTangZYInv);
                 for (int iElemX = 0; iElemX < nElemX; ++iElemX) {
+                    int rxFstElemX = iElemX+rxApFstElemX[iTx];
                     offset = iTx*nElemY*nElemX*nSamp + iElemY*nElemX*nSamp + iElemX*nSamp;
-                    rxTang = (xPix[x] - xElemConst[iElemX])*zDistInv;
+                    rxTang = (xPix[x] - xElemConst[rxFstElemX])*zDistInv;
                     if (rxTang < minRxTangZX || rxTang > maxRxTangZX) continue;
                     rxApodX = interpLinearNormalized(rxApod, nRxApod, (rxTang-minRxTangZX)*rngRxTangZXInv);
-                    rxDist = ownHypotf(zPix[z], xPix[x] - xElemConst[iElemX], yPix[y] - yElemConst[iElemY]);
+                    rxDist = ownHypotf(zPix[z], xPix[x] - xElemConst[rxFstElemX], yPix[y] - yElemConst[rxFstElemY]);
                     time = (txDist+rxDist)*sosInv + initDel;
                     iSamp = time*fs;
                     if (iSamp < 0 || iSamp > static_cast<float>(nSamp-1)) continue;