Skip to content

Commit

Permalink
Improved iq raw 2 LRI performance.
Browse files Browse the repository at this point in the history
Moved the probe geometry to constant memory, changed kernel  grid of threads dimensions.
  • Loading branch information
pjarosik committed Nov 12, 2021
1 parent 02d71df commit 906f931
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 99 deletions.
47 changes: 36 additions & 11 deletions api/python/arrus/utils/imaging.py
Original file line number Diff line number Diff line change
Expand Up @@ -1435,18 +1435,21 @@ def __init__(self, x_grid, z_grid, rx_tang_limits=None):

def set_pkgs(self, num_pkg, **kwargs):
if num_pkg is np:
raise ValueError("Currently reconstructLri operation is "
"implemented for GPU only.")
raise ValueError("ReconstructLri operation is implemented for GPU only.")

def prepare(self, const_metadata):
from pathlib import Path
import os
import cupy as cp

current_dir = os.path.dirname(os.path.join(os.path.abspath(__file__)))
_kernel_source = Path(os.path.join(current_dir, "iq_raw_2_lri.cu")).read_text()
self._kernel = self.num_pkg.RawKernel(_kernel_source, "iqRaw2Lri")
self._kernel_module = self.num_pkg.RawModule(code=_kernel_source)
self._kernel = self._kernel_module.get_function("iqRaw2Lri")
self._z_elem_const = self._kernel_module.get_global("zElemConst")
self._tang_elem_const = self._kernel_module.get_global("tangElemConst")

# INPUT PARAMETERS.

# Input data shape.
self.n_tx, self.n_rx, self.n_samples = const_metadata.input_shape

Expand All @@ -1461,10 +1464,13 @@ def prepare(self, const_metadata):
self.output_buffer = self.num_pkg.zeros(output_shape, dtype=self.num_pkg.complex64)
x_block_size = min(self.x_size, 16)
z_block_size = min(self.z_size, 16)
self.block_size = (z_block_size, x_block_size, 1)
tx_block_size = min(self.n_tx, 4)
self.block_size = (z_block_size, x_block_size, tx_block_size)
self.grid_size = (int((self.z_size-1)//z_block_size + 1),
int((self.x_size-1)//x_block_size + 1),
self.n_tx)
int((self.n_tx-1)//tx_block_size + 1))
print(self.block_size)
print(self.grid_size)
self.x_pix = self.num_pkg.asarray(self.x_grid, dtype=self.num_pkg.float32)
self.z_pix = self.num_pkg.asarray(self.z_grid, dtype=self.num_pkg.float32)

Expand All @@ -1478,11 +1484,23 @@ def prepare(self, const_metadata):
element_pos_x = probe_model.element_pos_x
element_pos_z = probe_model.element_pos_z
element_angle_tang = np.tan(probe_model.element_angle)
self.x_elem = self.num_pkg.asarray(element_pos_x, dtype=self.num_pkg.float32)
self.z_elem = self.num_pkg.asarray(element_pos_z, dtype=self.num_pkg.float32)
self.tang_elem = self.num_pkg.asarray(element_angle_tang, dtype=self.num_pkg.float32)

self.n_elements = probe_model.n_elements

device_props = cp.cuda.runtime.getDeviceProperties(0)
if device_props["totalConstMem"] < 256*3*4: # 3 float32 arrays, 256 elements max
raise ValueError("There is not enough constant memory available!")

x_elem = np.asarray(element_pos_x, dtype=self.num_pkg.float32)
self._x_elem_const = _get_const_memory_array(
self._kernel_module, name="xElemConst", input_array=x_elem)
z_elem = np.asarray(element_pos_z, dtype=self.num_pkg.float32)
self._z_elem_const = _get_const_memory_array(
self._kernel_module, name="zElemConst", input_array=z_elem)
tang_elem = np.asarray(element_angle_tang, dtype=self.num_pkg.float32)
self._tang_elem_const = _get_const_memory_array(
self._kernel_module, name="tangElemConst", input_array=tang_elem)

# TX aperture description
# Convert the sequence to the positions of the aperture centers
tx_rx_params = arrus.kernels.imaging.compute_tx_rx_params(
Expand Down Expand Up @@ -1530,8 +1548,7 @@ def process(self, data):
params = (
self.output_buffer,
data,
self.x_elem, self.z_elem, self.tang_elem, self.n_elements, # DONE
self.n_tx, self.n_samples,
self.n_elements, self.n_tx, self.n_samples,
self.z_pix, self.z_size,
self.x_pix, self.x_size,
self.sos, self.fs, self.fn,
Expand Down Expand Up @@ -1757,3 +1774,11 @@ def process(self, data):
self._remap_fn(data)
return self._output_buffer


def _get_const_memory_array(module, name, input_array):
import cupy as cp
const_arr_ptr = module.get_global(name)
const_arr = cp.ndarray(shape=input_array.shape, dtype=input_array.dtype,
memptr=const_arr_ptr)
const_arr.set(input_array)
return const_arr
174 changes: 86 additions & 88 deletions api/python/arrus/utils/iq_raw_2_lri.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@

#define CUDART_PI_F 3.141592654f

__constant__ float zElemConst[256];
__constant__ float xElemConst[256];
__constant__ float tangElemConst[256];

extern "C"
__global__ void
iqRaw2Lri(complex<float> *iqLri, const complex<float> *iqRaw,
const float *xElem, const float *zElem, const float *tangElem, const int nElem,
const int nTx, const int nSamp,
const int nElem, const int nTx, const int nSamp,
const float *zPix, const int nZPix,
const float *xPix, const int nXPix,
float const sos, float const fs, float const fn,
Expand All @@ -19,8 +22,9 @@ iqRaw2Lri(complex<float> *iqLri, const complex<float> *iqRaw,

int z = blockIdx.x * blockDim.x + threadIdx.x;
int x = blockIdx.y * blockDim.y + threadIdx.y;
int iTx = blockIdx.z * blockDim.z + threadIdx.z;

if (z >= nZPix || x >= nXPix) {
if(z >= nZPix || x >= nXPix || iTx >= nTx) {
return;
}

Expand All @@ -36,93 +40,87 @@ iqRaw2Lri(complex<float> *iqLri, const complex<float> *iqRaw,
const float centRxTang = (maxRxTang + minRxTang) * 0.5f;
complex<float> pix(0.0f, 0.0f), samp(0.0f, 0.0f), modFactor;

for (int iTx = 0; iTx < nTx; ++iTx) {
int txOffset = iTx*nSamp*nRx;

if (!isinf(txFoc[iTx])) {
/* STA */
float zFoc = txApCentZ[iTx] + txFoc[iTx]*cosf(txAngZX[iTx]);
float xFoc = txApCentX[iTx] + txFoc[iTx]*sinf(txAngZX[iTx]);

float pixFocArrang;

if (txFoc[iTx] <= 0.0f) {
/* Virtual Point Source BEHIND probe surface */
// Valid pixels are assumed to be always in front of the focal point (VSP)
pixFocArrang = 1.0f;
}
else {
/* Virtual Point Source IN FRONT OF probe surface */
// Projection of the Foc-Pix vector on the ApCent-Foc vector (dot product) ...
// to determine if the pixel is behind (-) or in front of (+) the focal point (VSP).
pixFocArrang = (((zPix[z]-zFoc)*(zFoc-txApCentZ[iTx]) +
(xPix[x]-xFoc)*(xFoc-txApCentX[iTx])) >= 0.f) ? 1.f : -1.f;
}
txDist = hypotf(zPix[z] - zFoc, xPix[x] - xFoc);
txDist *= pixFocArrang; // Compensation for the Pix-Foc arrangement
txDist += txFoc[iTx]; // Compensation for the reference time being the moment when txApCent fires.

// Projections of Foc-Pix vector on the rotated Foc-ApEdge vectors (dot products) ...
// to determine if the pixel is in the sonified area (dot product >= 0).
// Foc-ApEdgeFst vector is rotated left, Foc-ApEdgeLst vector is rotated right.
txApod = ( ( (-(xElem[txApFstElem[iTx]] - xFoc)*(zPix[z] - zFoc) +
(zElem[txApFstElem[iTx]] - zFoc)*(xPix[x] - xFoc))*pixFocArrang >= 0.f ) &&
( ( (xElem[txApLstElem[iTx]] - xFoc)*(zPix[z] - zFoc) -
(zElem[txApLstElem[iTx]] - zFoc)*(xPix[x] - xFoc))*pixFocArrang >= 0.f ) ) ? 1.f : 0.f;
}
else {
/* PWI */
txDist = (zPix[z] - txApCentZ[iTx]) * cosf(txAngZX[iTx]) +
(xPix[x] - txApCentX[iTx]) * sinf(txAngZX[iTx]);

// Projections of ApEdge-Pix vector on the rotated unit vector of tx direction (dot products) ...
// to determine if the pixel is in the sonified area (dot product >= 0).
// For ApEdgeFst, the vector is rotated left, for ApEdgeLst the vector is rotated right.
txApod = (((-(zPix[z] - zElem[txApFstElem[iTx]])*sinf(txAngZX[iTx]) +
(xPix[x] - xElem[txApFstElem[iTx]])*cosf(txAngZX[iTx])) >= 0.f) &&
(((zPix[z] - zElem[txApLstElem[iTx]])*sinf(txAngZX[iTx]) -
(xPix[x] - xElem[txApLstElem[iTx]])*cosf(txAngZX[iTx])) >= 0.f)) ? 1.f : 0.f;
int txOffset = iTx * nSamp * nRx;

if(!isinf(txFoc[iTx])) {
/* STA */
float zFoc = txApCentZ[iTx] + txFoc[iTx] * cosf(txAngZX[iTx]);
float xFoc = txApCentX[iTx] + txFoc[iTx] * sinf(txAngZX[iTx]);

float pixFocArrang;

if(txFoc[iTx] <= 0.0f) {
/* Virtual Point Source BEHIND probe surface */
// Valid pixels are assumed to be always in front of the focal point (VSP)
pixFocArrang = 1.0f;
} else {
/* Virtual Point Source IN FRONT OF probe surface */
// Projection of the Foc-Pix vector on the ApCent-Foc vector (dot product) ...
// to determine if the pixel is behind (-) or in front of (+) the focal point (VSP).
pixFocArrang = (((zPix[z] - zFoc) * (zFoc - txApCentZ[iTx]) +
(xPix[x] - xFoc) * (xFoc - txApCentX[iTx])) >= 0.f) ? 1.f : -1.f;
}
pixWgh = 0.0f;
pix.real(0.0f);
pix.imag(0.0f);

if (txApod != 0.0f) {
for (int iRx = 0; iRx < nRx; iRx++) {
iElem = iRx + rxApOrigElem[iTx];
if (iElem < 0 || iElem >= nElem) continue;

rxDist = hypotf(xPix[x] - xElem[iElem], zPix[z] - zElem[iElem]);
rxTang = __fdividef(xPix[x] - xElem[iElem], zPix[z] - zElem[iElem]);
rxTang = __fdividef(rxTang - tangElem[iElem], 1.f + rxTang*tangElem[iElem]);
if (rxTang < minRxTang || rxTang > maxRxTang) continue;

rxApod = (rxTang - centRxTang) * rngRxTangInv;
rxApod = __expf(-rxApod * rxApod * twoSigSqrInv);

time = (txDist + rxDist)*sosInv + initDel;
iSamp = time * fs;
if (iSamp < 0.0f || iSamp >= static_cast<float>(nSamp - 1)) {
continue;
}
offset = txOffset + iRx*nSamp;
interpWgh = modff(iSamp, &iSamp);
int intSamp = int(iSamp);

__sincosf(omega*time, &modSin, &modCos);
complex<float> modFactor = complex<float>(modCos, modSin);

samp = iqRaw[offset+intSamp]*(1-interpWgh) + iqRaw[offset+intSamp+1]*interpWgh;
pix += samp*modFactor*rxApod;
pixWgh += rxApod;
txDist = hypotf(zPix[z] - zFoc, xPix[x] - xFoc);
txDist *= pixFocArrang; // Compensation for the Pix-Foc arrangement
txDist += txFoc[iTx]; // Compensation for the reference time being the moment when txApCent fires.

// Projections of Foc-Pix vector on the rotated Foc-ApEdge vectors (dot products) ...
// to determine if the pixel is in the sonified area (dot product >= 0).
// Foc-ApEdgeFst vector is rotated left, Foc-ApEdgeLst vector is rotated right.
txApod = (((-(xElemConst[txApFstElem[iTx]] - xFoc) * (zPix[z] - zFoc) +
(zElemConst[txApFstElem[iTx]] - zFoc) * (xPix[x] - xFoc)) * pixFocArrang >= 0.f) &&
(((xElemConst[txApLstElem[iTx]] - xFoc) * (zPix[z] - zFoc) -
(zElemConst[txApLstElem[iTx]] - zFoc) * (xPix[x] - xFoc)) * pixFocArrang >= 0.f)) ? 1.f : 0.f;
} else {
/* PWI */
txDist = (zPix[z] - txApCentZ[iTx]) * cosf(txAngZX[iTx]) +
(xPix[x] - txApCentX[iTx]) * sinf(txAngZX[iTx]);

// Projections of ApEdge-Pix vector on the rotated unit vector of tx direction (dot products) ...
// to determine if the pixel is in the sonified area (dot product >= 0).
// For ApEdgeFst, the vector is rotated left, for ApEdgeLst the vector is rotated right.
txApod = (((-(zPix[z] - zElemConst[txApFstElem[iTx]]) * sinf(txAngZX[iTx]) +
(xPix[x] - xElemConst[txApFstElem[iTx]]) * cosf(txAngZX[iTx])) >= 0.f) &&
(((zPix[z] - zElemConst[txApLstElem[iTx]]) * sinf(txAngZX[iTx]) -
(xPix[x] - xElemConst[txApLstElem[iTx]]) * cosf(txAngZX[iTx])) >= 0.f)) ? 1.f : 0.f;
}
pixWgh = 0.0f;
pix.real(0.0f);
pix.imag(0.0f);

if(txApod != 0.0f) {
for(int iRx = 0; iRx < nRx; iRx++) {
iElem = iRx + rxApOrigElem[iTx];
if(iElem < 0 || iElem >= nElem) continue;

rxDist = hypotf(xPix[x] - xElemConst[iElem], zPix[z] - zElemConst[iElem]);
rxTang = __fdividef(xPix[x] - xElemConst[iElem], zPix[z] - zElemConst[iElem]);
rxTang = __fdividef(rxTang - tangElemConst[iElem], 1.f + rxTang * tangElemConst[iElem]);
if(rxTang < minRxTang || rxTang > maxRxTang) continue;

rxApod = (rxTang - centRxTang) * rngRxTangInv;
rxApod = __expf(-rxApod * rxApod * twoSigSqrInv);

time = (txDist + rxDist) * sosInv + initDel;
iSamp = time * fs;
if(iSamp < 0.0f || iSamp >= static_cast<float>(nSamp - 1)) {
continue;
}
}
if(pixWgh == 0.0f) {
iqLri[z + x*nZPix + iTx*nZPix*nXPix] = complex<float>(0.0f, 0.0f);
}
else {
iqLri[z + x * nZPix + iTx * nZPix * nXPix] = pix/pixWgh*txApod;
offset = txOffset + iRx * nSamp;
interpWgh = modff(iSamp, &iSamp);
int intSamp = int(iSamp);

__sincosf(omega * time, &modSin, &modCos);
complex<float> modFactor = complex<float>(modCos, modSin);

samp = iqRaw[offset + intSamp] * (1 - interpWgh) + iqRaw[offset + intSamp + 1] * interpWgh;
pix += samp * modFactor * rxApod;
pixWgh += rxApod;
}
}

if(pixWgh == 0.0f) {
iqLri[z + x * nZPix + iTx * nZPix * nXPix] = complex<float>(0.0f, 0.0f);
} else {
iqLri[z + x * nZPix + iTx * nZPix * nXPix] = pix / pixWgh * txApod;
}
}

0 comments on commit 906f931

Please sign in to comment.