Commit b7de19a
Implemented custom FIR filter.
pjarosik committed Apr 2, 2021
1 parent 39eb58f commit b7de19a
Showing 4 changed files with 159 additions and 8 deletions.
66 changes: 66 additions & 0 deletions api/python/arrus/utils/fir.py
@@ -2,6 +2,7 @@

# TODO currently only complex input data is supported (part of DDC)

# Complex, Complex
_gpu_fir_complex64_str = r'''
#include <cupy/complex.cuh>
@@ -72,5 +73,70 @@ def run_fir(grid_size, block_size, params, shared_mem_size):
return gpu_fir_complex64(grid_size, block_size, params, shared_mem=shared_mem_size)


# TODO refactor the below
_gpu_fir_int16_float32_str = r'''
extern "C" __global__ void gpu_fir_int16_float32(
float* __restrict__ output, const short* __restrict__ input,
const int nSamples, const int length,
const float* __restrict__ kernel, const int kernelWidth) {
int idx = threadIdx.x + blockIdx.x*blockDim.x;
int ch = idx / nSamples;
int sample = idx % nSamples;
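    // Shared memory layout: (kernelWidth+blockDim.x) floats of cached
    // input samples, followed by kernelWidth floats of filter coefficients.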
extern __shared__ char sharedMemory[];
float* cachedInputData = (float*)sharedMemory;
float* cachedKernel = (float*)(sharedMemory+(kernelWidth+blockDim.x)*sizeof(float));
// Cache kernel.
for(int i = threadIdx.x; i < kernelWidth; i += blockDim.x) {
cachedKernel[i] = kernel[i];
}
// Cache input.
for(int i = sample-kernelWidth/2-1, localIdx = threadIdx.x;
localIdx < (kernelWidth+blockDim.x); i += blockDim.x, localIdx += blockDim.x) {
if (i < 0 || i >= nSamples) {
cachedInputData[localIdx] = 0.0f;
}
else {
cachedInputData[localIdx] = (float)input[ch*nSamples + i];
}
}
__syncthreads();
if(idx >= length) {
return;
}
float result = 0.0f;
int localN = threadIdx.x + kernelWidth;
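    // localN is this thread's position in the cached window; the loop
    // below computes a centered convolution over the zero-padded input.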
for (int i = 0; i < kernelWidth; ++i) {
result += cachedInputData[localN-i]*cachedKernel[i];
}
output[idx] = result;
}
'''


gpu_fir_int16_float32 = cp.RawKernel(_gpu_fir_int16_float32_str, "gpu_fir_int16_float32")


def get_default_grid_block_size_fir_int16(n_samples, total_n_samples):
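    # One thread per output sample: the block covers up to _DEFAULT_BLOCK_SIZE
    # samples, and the grid size is the ceiling of total_n_samples/block_size.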
block_size = (min((n_samples, _DEFAULT_BLOCK_SIZE)), )
grid_size = (int((total_n_samples-1) // block_size[0] + 1), )
return (grid_size, block_size)


def get_default_shared_mem_size_fir_int16(n_samples, filter_size):
# filter size (actual filter coefficients) + (block size + filter_size (padding))
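    # e.g. filter_size=64 with a block size of 512: 64*4 + (512+64)*4 == 2560 B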
return filter_size*4 + (min(n_samples, _DEFAULT_BLOCK_SIZE) + filter_size)*4


def run_fir_int16(grid_size, block_size, params, shared_mem_size):
"""
:param params: a list: data_out, data_in, input_n_samples, input_total_n_samples, kernel, kernel_size
"""
return gpu_fir_int16_float32(grid_size, block_size, params, shared_mem=shared_mem_size)
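
For reference, here is a minimal usage sketch of the int16 FIR helpers above. The shapes, the firwin tap design, and the explicit int32 casts are illustrative assumptions (the committed pipeline code passes plain Python ints); cupy and scipy are required:

import cupy as cp
import numpy as np
import scipy.signal

from arrus.utils.fir import (
    run_fir_int16,
    get_default_grid_block_size_fir_int16,
    get_default_shared_mem_size_fir_int16)

n_channels, n_samples = 64, 1024
total_n_samples = n_channels*n_samples
# Illustrative 33-tap low-pass filter with cutoff at half Nyquist.
taps = cp.asarray(scipy.signal.firwin(33, 0.5), dtype=cp.float32)

data_in = cp.asarray(
    np.random.randint(-2**15, 2**15, size=(n_channels, n_samples))
      .astype(np.int16))
data_out = cp.zeros((n_channels, n_samples), dtype=cp.float32)

grid_size, block_size = get_default_grid_block_size_fir_int16(
    n_samples, total_n_samples)
shared_mem = get_default_shared_mem_size_fir_int16(n_samples, len(taps))
# Scalars cast to int32 to match the kernel's `const int` parameters.
run_fir_int16(grid_size, block_size,
              (data_out, data_in, np.int32(n_samples),
               np.int32(total_n_samples), taps, np.int32(len(taps))),
              shared_mem)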
70 changes: 63 additions & 7 deletions api/python/arrus/utils/imaging.py
@@ -69,7 +69,6 @@ class BandpassFilter:
Currently only FIR filter is available.
"""

def __init__(self, numtaps=7, bounds=(0.5, 1.5), filter_type="butter",
num_pkg=None, filter_pkg=None):
"""
@@ -96,16 +95,73 @@ def _prepare(self, const_metadata: arrus.metadata.ConstMetadata):
sampling_frequency = const_metadata.data_description.sampling_frequency
# FIXME(pjarosik) implement iir filter
taps, _ = scipy.signal.butter(
        2,
        [l * center_frequency, r * center_frequency],
        btype='bandpass', fs=sampling_frequency)
self.taps = self.xp.asarray(taps).astype(self.xp.float32)
return const_metadata

def __call__(self, data):
-        result = self.filter_pkg.convolve1d(data, self.taps, axis=-1,
-                                            mode='constant')
-        return result
+        return self.filter_pkg.convolve1d(data.astype(self.xp.float32),
+                                          self.taps, axis=-1,
+                                          mode='constant')


class FirFilter:

def __init__(self, taps, num_pkg=None, filter_pkg=None):
"""
Bandpass filter constructor.
:param bounds: determines filter's frequency boundaries,
e.g. setting 0.5 will give a bandpass filter
[0.5*center_frequency, 1.5*center_frequency].
"""
self.taps = taps
self.xp = num_pkg
self.filter_pkg = filter_pkg
self.convolve1d_func = None
self.dumped = 0

def set_pkgs(self, num_pkg, filter_pkg, **kwargs):
self.xp = num_pkg
self.filter_pkg = filter_pkg

def _prepare(self, const_metadata: arrus.metadata.ConstMetadata):
if self.xp == np:
            raise ValueError("FirFilter is not yet implemented for the CPU (numpy) backend.")
import cupy as cp
self.taps = cp.asarray(self.taps).astype(cp.float32)
n_taps = len(self.taps)

n_frames, n_channels, n_samples = const_metadata.input_shape
total_n_samples = n_frames*n_channels*n_samples
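        # The kernel treats the input as (n_frames*n_channels) independent
        # channels of n_samples samples each.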

fir_output_buffer = cp.zeros(const_metadata.input_shape, dtype=cp.float32)
from arrus.utils.fir import (
run_fir_int16,
get_default_grid_block_size_fir_int16,
get_default_shared_mem_size_fir_int16
)
grid_size, block_size = get_default_grid_block_size_fir_int16(
n_samples,
total_n_samples)
shared_memory_size = get_default_shared_mem_size_fir_int16(
n_samples, n_taps)

def gpu_convolve1d(data):
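            # The kernel requires C-contiguous int16 input; the preallocated
            # float32 output buffer is reused on every call.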
data = cp.ascontiguousarray(data)
run_fir_int16(
grid_size, block_size,
(fir_output_buffer, data, n_samples,
total_n_samples, self.taps, n_taps),
shared_memory_size)
return fir_output_buffer
self.convolve1d_func = gpu_convolve1d
return const_metadata

def __call__(self, data):
return self.convolve1d_func(data)


class QuadratureDemodulation:
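A short usage sketch of the new FirFilter stage. The firwin design values are illustrative assumptions; in a real pipeline, set_pkgs and _prepare are invoked by the processing framework with actual metadata:

import scipy.signal
from arrus.utils.imaging import FirFilter

# Illustrative 63-tap low-pass design: 10 MHz cutoff at 65 MHz sampling rate.
taps = scipy.signal.firwin(63, 10e6, fs=65e6)
fir = FirFilter(taps=taps)
# Once the pipeline has called set_pkgs/_prepare, fir(data) filters int16
# cupy arrays of shape (n_frames, n_channels, n_samples) along the samples
# axis and returns a float32 array of the same shape.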
1 change: 0 additions & 1 deletion api/python/arrus/utils/us4r.py
@@ -201,7 +201,6 @@ def cpu_remap_fn(data):
self._remap_fn = cpu_remap_fn
else:
# GPU
print("Using new GPU kernel")
import cupy as cp
self._fcm_frames = cp.asarray(fcm.frames)
self._fcm_channels = cp.asarray(fcm.channels)
30 changes: 30 additions & 0 deletions api/python/wrappers/core.i
@@ -85,6 +85,14 @@ using namespace ::arrus;
%include "arrus/core/api/common/Logger.h"

%inline %{
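// RAII guard: releases the Python GIL on construction and re-acquires it
// on destruction. Currently unused; see the TODO in getTailAddress below.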
struct GILReleaser {
GILReleaser(): save(PyEval_SaveThread()) {}
~GILReleaser() {
PyEval_RestoreThread(save);
}
PyThreadState *save;
};

std::shared_ptr<::arrus::Logging> LOGGING_FACTORY;

// TODO consider moving the below function to %init
@@ -189,6 +197,28 @@ double getPitch(const arrus::devices::ProbeModel &probe) {
}
return pitch[0];
}

size_t getTailAddress(std::shared_ptr<arrus::devices::HostBuffer> buffer) {
    // TODO: consider releasing the GIL while waiting for the tail address
    // (e.g. Py_BEGIN_ALLOW_THREADS/Py_END_ALLOW_THREADS, or the GILReleaser
    // defined above).
    return buffer->tailAddress(-1);
}

void releaseTail(std::shared_ptr<arrus::devices::HostBuffer> buffer) {
    // TODO: consider releasing the GIL here as well.
    buffer->releaseTail(-1);
}

%};

// ------------------------------------------ COMMON
