Commit b7de19a
Implemented custom FIR filter.
pjarosik committed Apr 2, 2021
1 parent 39eb58f commit b7de19a
Showing 4 changed files with 159 additions and 8 deletions.
66 changes: 66 additions & 0 deletions api/python/arrus/utils/fir.py
@@ -2,6 +2,7 @@

# TODO currently only complex input data is supported (part of DDC)

# Complex, Complex
_gpu_fir_complex64_str = r'''
#include <cupy/complex.cuh>
@@ -72,5 +73,70 @@ def run_fir(grid_size, block_size, params, shared_mem_size):
return gpu_fir_complex64(grid_size, block_size, params, shared_mem=shared_mem_size)


# TODO refactor the below
_gpu_fir_int16_float32_str = r'''
extern "C" __global__ void gpu_fir_int16_float32(
float* __restrict__ output, const short* __restrict__ input,
const int nSamples, const int length,
const float* __restrict__ kernel, const int kernelWidth) {
int idx = threadIdx.x + blockIdx.x*blockDim.x;
int ch = idx / nSamples;
int sample = idx % nSamples;
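    // Shared memory layout: (kernelWidth+blockDim.x) floats of cached
    // input samples, followed by kernelWidth floats of filter coefficients.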
extern __shared__ char sharedMemory[];
float* cachedInputData = (float*)sharedMemory;
float* cachedKernel = (float*)(sharedMemory+(kernelWidth+blockDim.x)*sizeof(float));
// Cache kernel.
for(int i = threadIdx.x; i < kernelWidth; i += blockDim.x) {
cachedKernel[i] = kernel[i];
}
// Cache input.
for(int i = sample-kernelWidth/2-1, localIdx = threadIdx.x;
localIdx < (kernelWidth+blockDim.x); i += blockDim.x, localIdx += blockDim.x) {
if (i < 0 || i >= nSamples) {
cachedInputData[localIdx] = 0.0f;
}
else {
cachedInputData[localIdx] = (float)input[ch*nSamples + i];
}
}
__syncthreads();
if(idx >= length) {
return;
}
float result = 0.0f;
int localN = threadIdx.x + kernelWidth;
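    // localN is this thread's position in the cached window; the loop
    // below computes a centered convolution over the zero-padded input.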
for (int i = 0; i < kernelWidth; ++i) {
result += cachedInputData[localN-i]*cachedKernel[i];
}
output[idx] = result;
}
'''


gpu_fir_int16_float32 = cp.RawKernel(_gpu_fir_int16_float32_str, "gpu_fir_int16_float32")


def get_default_grid_block_size_fir_int16(n_samples, total_n_samples):
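    # One thread per output sample: the block covers up to _DEFAULT_BLOCK_SIZE
    # samples, and the grid size is the ceiling of total_n_samples/block_size.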
block_size = (min((n_samples, _DEFAULT_BLOCK_SIZE)), )
grid_size = (int((total_n_samples-1) // block_size[0] + 1), )
return (grid_size, block_size)


def get_default_shared_mem_size_fir_int16(n_samples, filter_size):
# filter size (actual filter coefficients) + (block size + filter_size (padding))
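    # e.g. filter_size=64 with a block size of 512: 64*4 + (512+64)*4 == 2560 B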
return filter_size*4 + (min(n_samples, _DEFAULT_BLOCK_SIZE) + filter_size)*4


def run_fir_int16(grid_size, block_size, params, shared_mem_size):
"""
:param params: a list: data_out, data_in, input_n_samples, input_total_n_samples, kernel, kernel_size
"""
return gpu_fir_int16_float32(grid_size, block_size, params, shared_mem=shared_mem_size)
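
For reference, here is a minimal usage sketch of the int16 FIR helpers above. The shapes, the firwin tap design, and the explicit int32 casts are illustrative assumptions (the committed pipeline code passes plain Python ints); cupy and scipy are required:

import cupy as cp
import numpy as np
import scipy.signal

from arrus.utils.fir import (
    run_fir_int16,
    get_default_grid_block_size_fir_int16,
    get_default_shared_mem_size_fir_int16)

n_channels, n_samples = 64, 1024
total_n_samples = n_channels*n_samples
# Illustrative 33-tap low-pass filter with cutoff at half Nyquist.
taps = cp.asarray(scipy.signal.firwin(33, 0.5), dtype=cp.float32)

data_in = cp.asarray(
    np.random.randint(-2**15, 2**15, size=(n_channels, n_samples))
      .astype(np.int16))
data_out = cp.zeros((n_channels, n_samples), dtype=cp.float32)

grid_size, block_size = get_default_grid_block_size_fir_int16(
    n_samples, total_n_samples)
shared_mem = get_default_shared_mem_size_fir_int16(n_samples, len(taps))
# Scalars cast to int32 to match the kernel's `const int` parameters.
run_fir_int16(grid_size, block_size,
              (data_out, data_in, np.int32(n_samples),
               np.int32(total_n_samples), taps, np.int32(len(taps))),
              shared_mem)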
70 changes: 63 additions & 7 deletions api/python/arrus/utils/imaging.py
@@ -69,7 +69,6 @@ class BandpassFilter:
Currently only FIR filter is available.
"""

def __init__(self, numtaps=7, bounds=(0.5, 1.5), filter_type="butter",
num_pkg=None, filter_pkg=None):
"""
@@ -96,16 +95,73 @@ def _prepare(self, const_metadata: arrus.metadata.ConstMetadata):
sampling_frequency = const_metadata.data_description.sampling_frequency
# FIXME(pjarosik) implement iir filter
taps, _ = scipy.signal.butter(
        2,
        [l * center_frequency, r * center_frequency],
        btype='bandpass', fs=sampling_frequency)
self.taps = self.xp.asarray(taps).astype(self.xp.float32)
return const_metadata

def __call__(self, data):
-        result = self.filter_pkg.convolve1d(data, self.taps, axis=-1,
-                                            mode='constant')
-        return result
+        return self.filter_pkg.convolve1d(data.astype(self.xp.float32),
+                                          self.taps, axis=-1,
+                                          mode='constant')


class FirFilter:

def __init__(self, taps, num_pkg=None, filter_pkg=None):
"""
Bandpass filter constructor.
:param bounds: determines filter's frequency boundaries,
e.g. setting 0.5 will give a bandpass filter
[0.5*center_frequency, 1.5*center_frequency].
"""
self.taps = taps
self.xp = num_pkg
self.filter_pkg = filter_pkg
self.convolve1d_func = None
self.dumped = 0

def set_pkgs(self, num_pkg, filter_pkg, **kwargs):
self.xp = num_pkg
self.filter_pkg = filter_pkg

def _prepare(self, const_metadata: arrus.metadata.ConstMetadata):
if self.xp == np:
            raise ValueError("FirFilter is not yet implemented for the CPU (numpy) backend.")
import cupy as cp
self.taps = cp.asarray(self.taps).astype(cp.float32)
n_taps = len(self.taps)

n_frames, n_channels, n_samples = const_metadata.input_shape
total_n_samples = n_frames*n_channels*n_samples
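        # The kernel treats the input as (n_frames*n_channels) independent
        # channels of n_samples samples each.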

fir_output_buffer = cp.zeros(const_metadata.input_shape, dtype=cp.float32)
from arrus.utils.fir import (
run_fir_int16,
get_default_grid_block_size_fir_int16,
get_default_shared_mem_size_fir_int16
)
grid_size, block_size = get_default_grid_block_size_fir_int16(
n_samples,
total_n_samples)
shared_memory_size = get_default_shared_mem_size_fir_int16(
n_samples, n_taps)

def gpu_convolve1d(data):
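            # The kernel requires C-contiguous int16 input; the preallocated
            # float32 output buffer is reused on every call.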
data = cp.ascontiguousarray(data)
run_fir_int16(
grid_size, block_size,
(fir_output_buffer, data, n_samples,
total_n_samples, self.taps, n_taps),
shared_memory_size)
return fir_output_buffer
self.convolve1d_func = gpu_convolve1d
return const_metadata

def __call__(self, data):
return self.convolve1d_func(data)


class QuadratureDemodulation:
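A short usage sketch of the new FirFilter stage. The firwin design values are illustrative assumptions; in a real pipeline, set_pkgs and _prepare are invoked by the processing framework with actual metadata:

import scipy.signal
from arrus.utils.imaging import FirFilter

# Illustrative 63-tap low-pass design: 10 MHz cutoff at 65 MHz sampling rate.
taps = scipy.signal.firwin(63, 10e6, fs=65e6)
fir = FirFilter(taps=taps)
# Once the pipeline has called set_pkgs/_prepare, fir(data) filters int16
# cupy arrays of shape (n_frames, n_channels, n_samples) along the samples
# axis and returns a float32 array of the same shape.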
1 change: 0 additions & 1 deletion api/python/arrus/utils/us4r.py
@@ -201,7 +201,6 @@ def cpu_remap_fn(data):
self._remap_fn = cpu_remap_fn
else:
# GPU
print("Using new GPU kernel")
import cupy as cp
self._fcm_frames = cp.asarray(fcm.frames)
self._fcm_channels = cp.asarray(fcm.channels)
30 changes: 30 additions & 0 deletions api/python/wrappers/core.i
@@ -85,6 +85,14 @@ using namespace ::arrus;
%include "arrus/core/api/common/Logger.h"

%inline %{
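// RAII guard: releases the Python GIL on construction and re-acquires it
// on destruction. Currently unused; see the TODO in getTailAddress below.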
struct GILReleaser {
GILReleaser(): save(PyEval_SaveThread()) {}
~GILReleaser() {
PyEval_RestoreThread(save);
}
PyThreadState *save;
};

std::shared_ptr<::arrus::Logging> LOGGING_FACTORY;

// TODO consider moving the below function to %init
@@ -189,6 +197,28 @@ double getPitch(const arrus::devices::ProbeModel &probe) {
}
return pitch[0];
}

size_t getTailAddress(std::shared_ptr<arrus::devices::HostBuffer> buffer) {
    // TODO: consider releasing the GIL while waiting for the tail address
    // (e.g. Py_BEGIN_ALLOW_THREADS/Py_END_ALLOW_THREADS, or the GILReleaser
    // defined above).
    return buffer->tailAddress(-1);
}

void releaseTail(std::shared_ptr<arrus::devices::HostBuffer> buffer) {
    // TODO: consider releasing the GIL here as well.
    buffer->releaseTail(-1);
}

%};

// ------------------------------------------ COMMON
