From 6c9c0850e96c3507c60199534c08deec73349dd3 Mon Sep 17 00:00:00 2001
From: Piotr Jarosik <jarosik.piotr@gmail.com>
Date: Thu, 23 Dec 2021 23:23:23 +0100
Subject: [PATCH] Made arrus.utils.imaging compatible with batch-wise
 processing.

---
 api/python/arrus/utils/imaging.py        | 290 ++++++-----------------
 api/python/arrus/utils/interpolate.py    |   3 +-
 api/python/arrus/utils/iq_raw_2_lri.cu   |  14 +-
 api/python/arrus/utils/rx_beamforming.cu |  16 +-
 api/python/examples/reconstruct_lin.py   |  58 +++++
 docs/content/misc/release_notes.rst      |  13 +
 6 files changed, 157 insertions(+), 237 deletions(-)
 create mode 100644 api/python/examples/reconstruct_lin.py

diff --git a/api/python/arrus/utils/imaging.py b/api/python/arrus/utils/imaging.py
index a5c55a12a..fee607a38 100644
--- a/api/python/arrus/utils/imaging.py
+++ b/api/python/arrus/utils/imaging.py
@@ -75,16 +75,19 @@ def get_bmode_imaging(sequence, grid, placement="/GPU:0",
             steps=(
                 # Channel data pre-processing.
                 RemapToLogicalOrder(),
-                Transpose(axes=(0, 2, 1)),
+                Transpose(axes=(0, 1, 3, 2)),
                 BandpassFilter(),
                 QuadratureDemodulation(),
                 Decimation(decimation_factor=decimation_factor,
                            cic_order=decimation_cic_order),
                 # Data beamforming.
                 ReconstructLri(x_grid=x_grid, z_grid=z_grid),
-                Mean(axis=0),
+                # IQ compounding
+                Mean(axis=1),  # Along tx axis.
                 # Post-processing to B-mode image.
                 EnvelopeDetection(),
+                # Envelope compounding
+                Mean(axis=0),
                 Transpose(),
                 LogCompression()
             ),
@@ -255,7 +258,7 @@ class Operation:
 
     def prepare(self, const_metadata):
         """
-        Function that will called when the processing pipeline is prepared.
+        Function that will be called when the processing pipeline is prepared.
 
         :param const_metadata: const metadata describing output from the \
           previous Operation.
@@ -503,7 +506,6 @@ def prepare(self, const_metadata: arrus.metadata.ConstMetadata):
         l, r = self.bound_l, self.bound_r
         center_frequency = const_metadata.context.sequence.pulse.center_frequency
         sampling_frequency = const_metadata.data_description.sampling_frequency
-        # FIXME(pjarosik) implement iir filter
         taps, _ = scipy.signal.butter(
                 2,
                 [l * center_frequency, r * center_frequency],
@@ -543,8 +545,9 @@ def prepare(self, const_metadata: arrus.metadata.ConstMetadata):
         self.taps = cp.asarray(self.taps).astype(cp.float32)
         n_taps = len(self.taps)
 
-        n_frames, n_channels, n_samples = const_metadata.input_shape
-        total_n_samples = n_frames*n_channels*n_samples
+        input_shape = const_metadata.input_shape
+        n_samples = input_shape[-1]
+        total_n_samples = math.prod(input_shape)
 
         if total_n_samples == 0:
             raise ValueError("Empty array is not supported")
@@ -627,7 +630,8 @@ def prepare(self, const_metadata):
         xp = self.xp
         fs = const_metadata.data_description.sampling_frequency
         fc = const_metadata.context.sequence.pulse.center_frequency
-        _, _, n_samples = const_metadata.input_shape
+        input_shape = const_metadata.input_shape
+        n_samples = input_shape[-1]
         if n_samples == 0:
             raise ValueError("Empty array is not accepted.")
         t = (xp.arange(0, n_samples) / fs).reshape(1, 1, -1)
@@ -674,10 +678,10 @@ def prepare(self, const_metadata):
             sampling_frequency=new_fs, custom=
             const_metadata.data_description.custom)
 
-        n_frames, n_channels, n_samples = const_metadata.input_shape
-        total_n_samples = n_frames*n_channels*n_samples
-
-        output_shape = n_frames, n_channels, math.ceil(n_samples/self.decimation_factor)
+        input_shape = const_metadata.input_shape
+        n_samples = input_shape[-1]
+        total_n_samples = math.prod(input_shape)
+        output_shape = input_shape[:-1] + (math.ceil(n_samples/self.decimation_factor), )
 
         # CIC FIR coefficients
         if self.impl == "fir":
@@ -728,22 +732,22 @@ def process(self, data):
 
     def _fir_decimate(self, data):
         fir_output = self._fir_filter(data)
-        data_out = fir_output[:, :, 0::self.decimation_factor]
+        data_out = fir_output[..., 0::self.decimation_factor]
         return data_out
 
     def _legacy_decimate(self, data):
         data_out = data
         for i in range(self.cic_order):
             data_out = self.xp.cumsum(data_out, axis=-1)
-        data_out = data_out[:, :, 0::self.decimation_factor]
+        data_out = data_out[..., 0::self.decimation_factor]
         for i in range(self.cic_order):
-            data_out[:, :, 1:] = self.xp.diff(data_out, axis=-1)
+            data_out[..., 1:] = self.xp.diff(data_out, axis=-1)
         return data_out
 
 
 class RxBeamforming(Operation):
     """
-    Classical rx beamforming (reconstructing scanline by scanline).
+    Classical rx beamforming (reconstruct image scanline by scanline).
     This operator implements beamforming for linear scanning (element by element)
     and phased scanning (angle by angle).
     """
@@ -787,6 +791,7 @@ class RxBeamformingPhasedScanning(Operation):
     """
     Classical beamforming for phase array scanning.
     """
+
     def __init__(self, num_pkg=None):
         self.num_pkg = num_pkg
 
@@ -802,8 +807,8 @@ def prepare(self, const_metadata):
         self._kernel_module = _read_kernel_module("rx_beamforming.cu")
         self._kernel = self._kernel_module.get_function("beamformPhasedArray")
 
-        self.n_tx, self.n_rx, self.n_samples = const_metadata.input_shape
-        self.output_buffer = cp.zeros((self.n_tx, self.n_samples), dtype=cp.complex64)
+        self.n_seq, self.n_tx, self.n_rx, self.n_samples = const_metadata.input_shape
+        self.output_buffer = cp.zeros((self.n_seq, self.n_tx, self.n_samples), dtype=cp.complex64)
 
         seq = const_metadata.context.sequence
         self.tx_angles = cp.asarray(seq.angles, dtype=cp.float32)
@@ -840,12 +845,13 @@ def prepare(self, const_metadata):
         self.start_time = cp.float32(start_sample/acq_fs)
         self.init_delay = cp.float32(initial_delay)
         self.max_tang = cp.float32(max_tang)
-        scanline_block_size = min(self.n_tx, 16)
         sample_block_size = min(self.n_samples, 16)
-        self.block_size = (sample_block_size, scanline_block_size, 1)
+        scanline_block_size = min(self.n_tx, 16)
+        n_seq_block_size = min(self.n_seq, 4)
+        self.block_size = (sample_block_size, scanline_block_size, n_seq_block_size)
         self.grid_size = (int((self.n_samples-1)//sample_block_size + 1),
                           int((self.n_tx-1)//scanline_block_size + 1),
-                          1)
+                          int((self.n_seq-1)//n_seq_block_size + 1))
         # xElemConst
         # Get aperture origin (for the given aperture center element/aperture center)
         tx_rx_params = arrus.kernels.imaging.preprocess_sequence_parameters(probe_model, seq)
@@ -864,7 +870,7 @@ def process(self, data):
         data = self.num_pkg.ascontiguousarray(data)
         params = (
             self.output_buffer, data,
-            self.n_tx, self.n_rx, self.n_samples,
+            self.n_seq, self.n_tx, self.n_rx, self.n_samples,
             self.tx_angles,
             self.init_delay, self.start_time,
             self.c, self.fs, self.fc, self.max_tang)
@@ -914,7 +920,7 @@ def prepare(self, const_metadata: arrus.metadata.ConstMetadata):
         tx_rx_params = arrus.kernels.imaging.preprocess_sequence_parameters(probe_model, seq)
         rx_aperture_center_element = np.array(tx_rx_params["tx_ap_cent"])
 
-        self.n_tx, self.n_rx, self.n_samples = const_metadata.input_shape
+        self.n_seq, self.n_tx, self.n_rx, self.n_samples = const_metadata.input_shape
         self.is_iq = const_metadata.is_iq_data
         if self.is_iq:
             buffer_dtype = self.xp.complex64
@@ -922,8 +928,9 @@ def prepare(self, const_metadata: arrus.metadata.ConstMetadata):
             buffer_dtype = self.xp.float32
 
         # -- Output buffer
-        self.buffer = self.xp.zeros((self.n_tx, self.n_rx * self.n_samples),
-                                    dtype=buffer_dtype)
+        self.buffer = self.xp.zeros(
+            (self.n_seq*self.n_tx, self.n_rx * self.n_samples),
+            dtype=buffer_dtype)
 
         # -- Delays
         acq_fs = (const_metadata.context.device.sampling_frequency
@@ -976,7 +983,7 @@ def prepare(self, const_metadata: arrus.metadata.ConstMetadata):
             (x_distance - element_x) ** 2 + (z_distance - element_z) ** 2)
 
         self.t = (tx_distance + rx_distance) / c + initial_delay
-        self.delays = self.t * fs # in number of samples
+        self.delays = self.t * fs  # in number of samples
         total_n_samples = self.n_rx * self.n_samples
         # Move samples outside the available area
         self.delays[np.isclose(self.delays, self.n_samples-1)] = self.n_samples-1
@@ -1004,18 +1011,18 @@ def prepare(self, const_metadata: arrus.metadata.ConstMetadata):
         self.iq_correction = self.xp.exp(1j * 2 * np.pi * fc * self.t) \
             .astype(self.xp.complex64)
         # Create new output shape
-        return const_metadata.copy(input_shape=(self.n_tx, self.n_samples))
+        return const_metadata.copy(input_shape=(self.n_seq, self.n_tx, self.n_samples))
 
     def process(self, data):
-        data = data.copy().reshape(self.n_tx, self.n_rx * self.n_samples)
+        data = data.copy().reshape(self.n_seq*self.n_tx, self.n_rx * self.n_samples)
 
         self.interp1d_func(data, self.delays, self.buffer)
-        out = self.buffer.reshape((self.n_tx, self.n_rx, self.n_samples))
+        out = self.buffer.reshape((self.n_seq, self.n_tx, self.n_rx, self.n_samples))
         if self.is_iq:
             out = out * self.iq_correction
         out = out * self.rx_apodization
-        out = self.xp.sum(out, axis=1)
-        return out.reshape((self.n_tx, self.n_samples))
+        out = self.xp.sum(out, axis=2)
+        return out.reshape((self.n_seq, self.n_tx, self.n_samples))
 
 
 class EnvelopeDetection(Operation):
@@ -1141,7 +1148,7 @@ def _prepare_linear_array(self, const_metadata: arrus.metadata.ConstMetadata):
         import cupy as cp
         import cupyx.scipy.ndimage
         self.interp_function = cupyx.scipy.ndimage.map_coordinates
-        n_samples, n_scanlines = const_metadata.input_shape
+        self.n_frames, n_samples, n_scanlines = const_metadata.input_shape
         seq = const_metadata.context.sequence
         if not isinstance(seq, arrus.ops.imaging.LinSequence):
             raise ValueError("Scan conversion works only with LinSequence.")
@@ -1176,20 +1183,22 @@ def _prepare_linear_array(self, const_metadata: arrus.metadata.ConstMetadata):
         interp_z_grid = (self.z_grid-input_z_grid_origin)/input_z_grid_diff
         self._interp_mesh = cp.asarray(np.meshgrid(interp_z_grid, interp_x_grid, indexing="ij"))
 
-        self.dst_shape = len(self.z_grid.squeeze()), len(self.x_grid.squeeze())
+        self.dst_shape = self.n_frames, len(self.z_grid.squeeze()), len(self.x_grid.squeeze())
+        self.buffer = cp.zeros(self.dst_shape, dtype=cp.float32)
         return const_metadata.copy(input_shape=self.dst_shape)
 
     def _process_linear_array(self, data):
-        return self.interp_function(data, self._interp_mesh, order=1)
+        for i in range(self.n_frames):
+            self.buffer[i] = self.interp_function(data[i], self._interp_mesh, order=1)
+        return self.buffer
 
     def _prepare_convex(self, const_metadata: arrus.metadata.ConstMetadata):
         probe = const_metadata.context.device.probe.model
         medium = const_metadata.context.medium
         data_desc = const_metadata.data_description
 
-        n_samples, _ = const_metadata.input_shape
+        self.n_frames, n_samples, n_scanlines = const_metadata.input_shape
         seq = const_metadata.context.sequence
-        custom_data = const_metadata.context.custom_data
 
         acq_fs = (const_metadata.context.device.sampling_frequency
                   / seq.downsampling_factor)
@@ -1220,24 +1229,26 @@ def _prepare_convex(self, const_metadata: arrus.metadata.ConstMetadata):
         dst_points = np.dstack((radGridOut, azimuthGridOut))
         w, h, d = dst_points.shape
         self.dst_points = dst_points.reshape((w * h, d))
-        self.dst_shape = len(self.z_grid.squeeze()), len(self.x_grid.squeeze())
+        self.dst_shape = self.n_frames, len(self.z_grid.squeeze()), len(self.x_grid.squeeze())
+        self.output_buffer = np.zeros(self.dst_shape, dtype=np.float32)
         return const_metadata.copy(input_shape=self.dst_shape)
 
     def _process_convex(self, data):
         if self.is_gpu:
             data = data.get()
         data[np.isnan(data)] = 0.0
-        self.interpolator = scipy.interpolate.RegularGridInterpolator(
-            (self.radGridIn, self.azimuthGridIn), data, method="linear",
-            bounds_error=False, fill_value=0)
-        result = self.interpolator(self.dst_points).reshape(self.dst_shape)
-        return self.num_pkg.asarray(result).astype(np.float32)
+        for i in range(self.n_frames):
+            self.interpolator = scipy.interpolate.RegularGridInterpolator(
+                (self.radGridIn, self.azimuthGridIn), data[i], method="linear",
+                bounds_error=False, fill_value=0)
+            self.output_buffer[i] = self.interpolator(self.dst_points).reshape(self.dst_shape[1:])
+        return self.num_pkg.asarray(self.output_buffer).astype(np.float32)
 
     def _prepare_phased_array(self, const_metadata: arrus.metadata.ConstMetadata):
         probe = const_metadata.context.device.probe.model
         data_desc = const_metadata.data_description
 
-        n_samples, _ = const_metadata.input_shape
+        self.n_frames, n_samples, n_scanlines = const_metadata.input_shape
         seq = const_metadata.context.sequence
         fs = const_metadata.context.device.sampling_frequency
         acq_fs = fs / seq.downsampling_factor
@@ -1262,18 +1273,21 @@ def _prepare_phased_array(self, const_metadata: arrus.metadata.ConstMetadata):
         dst_points = np.dstack((radGridOut, azimuthGridOut))
         w, h, d = dst_points.shape
         self.dst_points = dst_points.reshape((w * h, d))
-        self.dst_shape = len(self.z_grid.squeeze()), len(self.x_grid.squeeze())
+        self.dst_shape = self.n_frames, len(self.z_grid.squeeze()), len(self.x_grid.squeeze())
+        self.output_buffer = np.zeros(self.dst_shape, dtype=np.float32)
         return const_metadata.copy(input_shape=self.dst_shape)
 
     def _process_phased_array(self, data):
         if self.is_gpu:
             data = data.get()
         data[np.isnan(data)] = 0.0
-        self.interpolator = scipy.interpolate.RegularGridInterpolator(
-            (self.radGridIn, self.azimuthGridIn), data, method="linear",
-            bounds_error=False, fill_value=0)
-        result = self.interpolator(self.dst_points).reshape(self.dst_shape)
-        return self.num_pkg.asarray(result).astype(np.float32)
+        for i in range(self.n_frames):
+            self.interpolator = scipy.interpolate.RegularGridInterpolator(
+                (self.radGridIn, self.azimuthGridIn), data[i], method="linear",
+                bounds_error=False, fill_value=0)
+            result = self.interpolator(self.dst_points).reshape(self.dst_shape[1:])
+            self.output_buffer[i] = result
+        return self.num_pkg.asarray(self.output_buffer).astype(np.float32)
 
 
 class LogCompression(Operation):
@@ -1619,172 +1633,6 @@ def process(self, data):
         return self.xp.squeeze(data)
 
 
-class RxBeamformingImg(Operation):
-    """
-    Rx beamforming for synthetic aperture imaging.
-
-    Expected input data shape: n_emissions, n_rx, n_samples
-
-    Currently Plane Wave Imaging (Pwi) is supported only.
-    """
-    def __init__(self, x_grid, z_grid, num_pkg=None):
-        self.x_grid = x_grid
-        self.z_grid = z_grid
-        self.delays = None
-        self.buffer = None
-        self.rx_apodization = None
-        self.xp = num_pkg
-        self.interp1d_func = None
-
-    def set_pkgs(self, num_pkg, **kwargs):
-        self.xp = num_pkg
-        if self.xp is np:
-            import scipy.interpolate
-
-            def numpy_interp1d(input, samples, output):
-                n_samples = input.shape[-1]
-                x = np.arange(0, n_samples)
-                interpolator = scipy.interpolate.interp1d(
-                    x, input, kind="linear", bounds_error=False,
-                    fill_value=0.0)
-                interpolated_values = interpolator(samples)
-                output[:] = interpolated_values
-
-            self.interp1d_func = numpy_interp1d
-        else:
-            import cupy as cp
-            if self.xp != cp:
-                raise ValueError(f"Unhandled numerical package: {self.xp}")
-            import arrus.utils.interpolate
-            self.interp1d_func = arrus.utils.interpolate.interp1d
-
-    def prepare(self, const_metadata: arrus.metadata.ConstMetadata):
-        probe_model = const_metadata.context.device.probe.model
-
-        if probe_model.is_convex_array():
-            raise ValueError("PWI reconstruction mode is available for "
-                             "linear arrays only.")
-
-        seq = const_metadata.context.sequence
-        medium = const_metadata.context.medium
-
-        self.n_tx, self.n_rx, self.n_samples = const_metadata.input_shape
-        self.is_iq = const_metadata.is_iq_data
-        if self.is_iq:
-            self.buffer_dtype = self.xp.complex64
-        else:
-            self.buffer_dtype = self.xp.float32
-
-        # -- Output buffer
-        x_size = len(self.x_grid.flatten())
-        z_size = len(self.z_grid.flatten())
-        self.buffer_shape = (self.n_rx, x_size, z_size)
-        self.buffer = self.xp.zeros(self.buffer_shape, dtype=self.buffer_dtype)\
-            .flatten()
-        self.buffer = self.xp.atleast_2d(self.buffer)
-        self.lri_buffer = self.xp.zeros((self.n_tx, x_size, z_size),
-                                        dtype=self.buffer_dtype)
-
-        # -- Delays
-
-        # --- Initial delay
-        acq_fs = (const_metadata.context.device.sampling_frequency
-                  / seq.downsampling_factor)
-        fs = const_metadata.data_description.sampling_frequency
-        fc = seq.pulse.center_frequency
-        n_periods = seq.pulse.n_periods
-        if seq.speed_of_sound is not None:
-            c = seq.speed_of_sound
-        else:
-            c = medium.speed_of_sound
-
-        angles = np.atleast_1d(np.array(seq.angles))
-        angles = np.expand_dims(angles, axis=(1, 2, 3))  # (ntx, 1, 1, 1)
-        tx_delay_center = 0.5*(probe_model.n_elements-1)*probe_model.pitch*np.abs(np.tan(angles))/c
-        tx_delay_center = np.squeeze(tx_delay_center)
-
-        start_sample = seq.rx_sample_range[0]
-        burst_factor = n_periods / (2 * fc)
-        initial_delay = (- start_sample / acq_fs
-                         + tx_delay_center
-                         + burst_factor)
-        initial_delay = np.array(initial_delay)
-        initial_delay = initial_delay[..., np.newaxis, np.newaxis, np.newaxis]
-
-        # --- Distances and apodizations
-        lambd = c / fc
-        max_tang = math.tan(
-            math.asin(min(1, 2 / 3 * lambd / probe_model.pitch)))
-
-        element_pos_x = probe_model.element_pos_x
-        rx_aperture_origin = np.zeros(self.n_tx, dtype=np.int16)
-        rx_aperture_origin = np.expand_dims(rx_aperture_origin, axis=(1, 2, 3))
-        # (ntx, 1, 1, 1)
-        # TODO parametrize rx aperture size
-        rx_aperture_size = probe_model.n_elements
-        irx = np.arange(0, probe_model.n_elements)
-        irx = np.expand_dims(irx, axis=(0, 2, 3))  # (1, nrx, 1, 1)
-        itx = np.expand_dims(np.arange(0, self.n_tx), axis=(1, 2, 3))
-        rx_aperture_element_pos_x = (rx_aperture_origin+irx -
-                                     (probe_model.n_elements-1)/2)*probe_model.pitch
-
-
-        # Output delays/apodization
-        x_grid = np.expand_dims(self.x_grid, axis=(0, 1, 3))
-        # (1, 1, x_size, 1)
-        z_grid = np.expand_dims(self.z_grid, axis=(0, 1, 2))
-
-        # (1, 1, 1, z_size)
-        tx_distance = x_grid*np.sin(angles) + z_grid*np.cos(angles)
-        # (ntx, 1, x_size, z_size)
-        r1 = (x_grid-element_pos_x[0])*np.cos(angles) - z_grid*np.sin(angles)
-        r2 = (x_grid-element_pos_x[-1])*np.cos(angles) - z_grid*np.sin(angles)
-        tx_apodization = np.logical_and(r1 >= 0, r2 <= 0).astype(np.int8)
-
-        rx_distance = np.sqrt((x_grid-rx_aperture_element_pos_x)**2 + z_grid**2)
-        # (ntx, nrx, x_size, z_size)
-        # rx_distance = np.expand_dims(rx_distance, axis=(0, 1))
-        rx_tangens = np.abs(x_grid - rx_aperture_element_pos_x)/(z_grid+1e-12)
-        rx_apodization = (rx_tangens < max_tang).astype(np.int8)
-
-        delay_total = (tx_distance + rx_distance)/c + initial_delay
-        samples = delay_total * fs
-        # samples outside should be neglected
-        samples[np.logical_or(samples < 0, samples >= self.n_samples-1)] = np.Inf
-        samples = samples + irx*self.n_samples
-        samples[np.isinf(samples)] = -1
-        samples = samples.astype(np.float32)
-        rx_weights = tx_apodization*rx_apodization
-
-        # IQ correction
-        if self.is_iq:
-            t = self.xp.asarray(delay_total)
-            rx_weights = self.xp.asarray(rx_weights)
-            self.iq_correction = self.xp.exp(1j*2*np.pi*fc*t)\
-                .astype(self.xp.complex64)
-            self.rx_weights = self.iq_correction * rx_weights
-        else:
-            self.rx_weights = self.xp.asarray(rx_weights)
-        tx_weights = np.sum(rx_weights, axis=1)
-        self.rx_weights = self.rx_weights.astype(self.xp.complex64)
-        self.tx_weights = self.xp.asarray(tx_weights).astype(self.xp.float32)
-        self.samples = self.xp.asarray(samples).astype(self.xp.float32)
-        # Create new output shape
-        return const_metadata.copy(input_shape=(len(self.x_grid),
-                                                len(self.z_grid)))
-
-    def process(self, data):
-        data = data.copy().reshape(self.n_tx, self.n_rx*self.n_samples)
-        for i in range(self.n_tx):
-            self.interp1d_func(data[i:(i+1)], self.samples[i:(i+1)].flatten(),
-                               self.buffer)
-            rf_rx = self.buffer.reshape(self.buffer_shape)
-            rf_rx = rf_rx * self.rx_weights[i]
-            rf_rx = np.sum(rf_rx, axis=0)
-            self.lri_buffer[i] = rf_rx
-        return np.sum(self.lri_buffer, axis=0)/np.sum(self.tx_weights, axis=0)
-
-
 class ReconstructLri(Operation):
     """
     Rx beamforming for synthetic aperture imaging.
@@ -1808,7 +1656,6 @@ def set_pkgs(self, num_pkg, **kwargs):
             raise ValueError("ReconstructLri operation is implemented for GPU only.")
 
     def prepare(self, const_metadata):
-
         import cupy as cp
 
         current_dir = os.path.dirname(os.path.join(os.path.abspath(__file__)))
@@ -1820,7 +1667,7 @@ def prepare(self, const_metadata):
 
         # INPUT PARAMETERS.
         # Input data shape.
-        self.n_tx, self.n_rx, self.n_samples = const_metadata.input_shape
+        self.n_seq, self.n_tx, self.n_rx, self.n_samples = const_metadata.input_shape
 
         seq = const_metadata.context.sequence
         probe_model = const_metadata.context.device.probe.model
@@ -1829,7 +1676,7 @@ def prepare(self, const_metadata):
 
         self.x_size = len(self.x_grid)
         self.z_size = len(self.z_grid)
-        output_shape = (self.n_tx, self.x_size, self.z_size)
+        output_shape = (self.n_seq, self.n_tx, self.x_size, self.z_size)
         self.output_buffer = self.num_pkg.zeros(output_shape, dtype=self.num_pkg.complex64)
         x_block_size = min(self.x_size, 16)
         z_block_size = min(self.z_size, 16)
@@ -1837,7 +1684,7 @@ def prepare(self, const_metadata):
         self.block_size = (z_block_size, x_block_size, tx_block_size)
         self.grid_size = (int((self.z_size-1)//z_block_size + 1),
                           int((self.x_size-1)//x_block_size + 1),
-                          int((self.n_tx-1)//tx_block_size + 1))
+                          int((self.n_seq*self.n_tx-1)//tx_block_size + 1))
         self.x_pix = self.num_pkg.asarray(self.x_grid, dtype=self.num_pkg.float32)
         self.z_pix = self.num_pkg.asarray(self.z_grid, dtype=self.num_pkg.float32)
 
@@ -1855,7 +1702,7 @@ def prepare(self, const_metadata):
         self.n_elements = probe_model.n_elements
 
         device_props = cp.cuda.runtime.getDeviceProperties(0)
-        if device_props["totalConstMem"] < 256*3*4: # 3 float32 arrays, 256 elements max
+        if device_props["totalConstMem"] < 256*3*4:  # 3 float32 arrays, 256 elements max
             raise ValueError("There is not enough constant memory available!")
 
         x_elem = np.asarray(element_pos_x, dtype=self.num_pkg.float32)
@@ -1902,8 +1749,6 @@ def prepare(self, const_metadata):
 
         self.min_tang = self.num_pkg.float32(self.min_tang)
         self.max_tang = self.num_pkg.float32(self.max_tang)
-
-        # PWI specific
         self.tx_foc = self.num_pkg.asarray([seq.tx_focus]*self.n_tx, dtype=self.num_pkg.float32)
         burst_factor = seq.pulse.n_periods / (2*self.fn)
         self.initial_delay = -start_sample/65e6+burst_factor+tx_center_delay
@@ -1915,7 +1760,8 @@ def process(self, data):
         params = (
             self.output_buffer,
             data,
-            self.n_elements, self.n_tx, self.n_samples,
+            self.n_elements,
+            self.n_seq, self.n_tx, self.n_samples,
             self.z_pix, self.z_size,
             self.x_pix, self.x_size,
             self.sos, self.fs, self.fn,
diff --git a/api/python/arrus/utils/interpolate.py b/api/python/arrus/utils/interpolate.py
index 2eac6a2dd..1d0fdaaa7 100644
--- a/api/python/arrus/utils/interpolate.py
+++ b/api/python/arrus/utils/interpolate.py
@@ -20,7 +20,6 @@
         if(xt >= outputWidth) {
             return;
         }
-
         float samplePos = samples[xt]; 
         int sampleNr = floorf(samplePos);
         float ratio = samplePos - sampleNr; 
@@ -71,4 +70,4 @@ def interp1d(input_data, samples, output_data):
     elif input_data.dtype == cp.float32:
         return _interp1d_kernel_float32(gridSize, blockSize, params)
     else:
-        raise ValueError(f"Unsupported data type: {input_data.dtype}")
\ No newline at end of file
+        raise ValueError(f"Unsupported data type: {input_data.dtype}")
diff --git a/api/python/arrus/utils/iq_raw_2_lri.cu b/api/python/arrus/utils/iq_raw_2_lri.cu
index 45bfed761..886c26cb0 100644
--- a/api/python/arrus/utils/iq_raw_2_lri.cu
+++ b/api/python/arrus/utils/iq_raw_2_lri.cu
@@ -9,7 +9,8 @@ __constant__ float tangElemConst[256];
 extern "C"
 __global__ void
 iqRaw2Lri(complex<float> *iqLri, const complex<float> *iqRaw,
-          const int nElem, const int nTx, const int nSamp,
+          const int nElem,
+          const int nSeq, const int nTx, const int nSamp,
           const float *zPix, const int nZPix,
           const float *xPix, const int nXPix,
           float const sos, float const fs, float const fn,
@@ -22,11 +23,12 @@ iqRaw2Lri(complex<float> *iqLri, const complex<float> *iqRaw,
 
     int z = blockIdx.x * blockDim.x + threadIdx.x;
     int x = blockIdx.y * blockDim.y + threadIdx.y;
-    int iTx = blockIdx.z * blockDim.z + threadIdx.z;
+    int iGlobalTx = blockIdx.z * blockDim.z + threadIdx.z;
 
-    if(z >= nZPix || x >= nXPix || iTx >= nTx) {
+    if(z >= nZPix || x >= nXPix || iGlobalTx >= nSeq*nTx) {
         return;
     }
+    int iTx = iGlobalTx % nTx;
 
     int iElem, offset;
     float interpWgh;
@@ -40,7 +42,7 @@ iqRaw2Lri(complex<float> *iqLri, const complex<float> *iqRaw,
     const float centRxTang = (maxRxTang + minRxTang) * 0.5f;
     complex<float> pix(0.0f, 0.0f), samp(0.0f, 0.0f), modFactor;
 
-    int txOffset = iTx * nSamp * nRx;
+    int txOffset = iGlobalTx * nSamp * nRx;
 
     if(!isinf(txFoc[iTx])) {
         /* STA */
@@ -119,8 +121,8 @@ iqRaw2Lri(complex<float> *iqLri, const complex<float> *iqRaw,
         }
     }
     if(pixWgh == 0.0f) {
-        iqLri[z + x * nZPix + iTx * nZPix * nXPix] = complex<float>(0.0f, 0.0f);
+        iqLri[z + x*nZPix + iGlobalTx*nZPix*nXPix] = complex<float>(0.0f, 0.0f);
     } else {
-        iqLri[z + x * nZPix + iTx * nZPix * nXPix] = pix / pixWgh * txApod;
+        iqLri[z + x*nZPix + iGlobalTx*nZPix*nXPix] = pix / pixWgh * txApod;
     }
 }
\ No newline at end of file
diff --git a/api/python/arrus/utils/rx_beamforming.cu b/api/python/arrus/utils/rx_beamforming.cu
index 3d3c163f3..b0a7c9d74 100644
--- a/api/python/arrus/utils/rx_beamforming.cu
+++ b/api/python/arrus/utils/rx_beamforming.cu
@@ -10,7 +10,7 @@ __constant__ float xElemConst[256]; // [m]
 // to the center of TX/RX aperture
 extern "C"
 __global__ void beamformPhasedArray(complex<float> *output, const complex<float> *input,
-                                    const unsigned nTx, const unsigned nRx, const unsigned nSamples,
+                                    const unsigned nSeq, const unsigned nTx, const unsigned nRx, const unsigned nSamples,
                                     const float *txAngles, // [rad]
                                     const float initDelay, const float startTime,
                                     const float c, const float fs, const float fc, float maxApodTang) {
@@ -21,13 +21,15 @@ __global__ void beamformPhasedArray(complex<float> *output, const complex<float>
     float modSin, modCos;
     unsigned signalOffset;
     int sInt;
-    unsigned sample = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned scanline = blockIdx.y * blockDim.y + threadIdx.y;
     complex<float> result = complex<float>(0.0f, 0.0f);
     complex<float> currentResult;
     complex<float> modFactor;
 
-    if(sample >= nSamples || scanline >= nTx) {
+    unsigned sample = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned scanline = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned frame = blockIdx.z * blockDim.z + threadIdx.z;
+
+    if(sample >= nSamples || scanline >= nTx || frame >= nSeq) {
         return;
     }
 
@@ -41,7 +43,7 @@ __global__ void beamformPhasedArray(complex<float> *output, const complex<float>
     float pointZ = r*txAngleCos;
     txDistance = r;
 
-    unsigned txOffset = scanline*nSamples*nRx;
+    unsigned txOffset = frame*nTx*nRx*nSamples + scanline*nRx*nSamples;
     float cInv = 1/c;
 
     for(unsigned element = 0; element < nRx; ++element) {
@@ -79,8 +81,8 @@ __global__ void beamformPhasedArray(complex<float> *output, const complex<float>
         ++pixWgh;
     }
     if(pixWgh == 0.0f) {
-        output[sample + scanline*nSamples] = complex<float>(0.0f, 0.0f);
+        output[frame*nTx*nSamples + scanline*nSamples + sample] = complex<float>(0.0f, 0.0f);
     } else {
-        output[sample + scanline*nSamples] = result/pixWgh;
+        output[frame*nTx*nSamples + scanline*nSamples + sample] = result/pixWgh;
     }
 }
\ No newline at end of file
diff --git a/api/python/examples/reconstruct_lin.py b/api/python/examples/reconstruct_lin.py
new file mode 100644
index 000000000..b2b710c29
--- /dev/null
+++ b/api/python/examples/reconstruct_lin.py
@@ -0,0 +1,58 @@
+import arrus
+import pickle
+import numpy as np
+import cupy as cp
+import matplotlib.pyplot as plt
+
+from arrus.utils.imaging import (
+    get_bmode_imaging, get_extent,
+    RxBeamforming,
+    ScanConversion,
+    Pipeline,
+    RemapToLogicalOrder,
+    SelectSequence,
+    Squeeze,
+    Lambda,
+    Transpose,
+    BandpassFilter,
+    QuadratureDemodulation,
+    Decimation,
+    ReconstructLri,
+    Mean,
+    EnvelopeDetection,
+    LogCompression
+)
+
+data = pickle.load(open("classical_data.pkl", "rb"))
+bmode, rf = data["data"]
+bmode_metadata, rf_metadata = data["metadata"]
+
+x_grid = np.arange(-15, 15, 0.1) * 1e-3
+z_grid = np.arange(5, 35, 0.1) * 1e-3
+
+pipeline = Pipeline(
+    steps=(
+        # Channel data pre-processing.
+        RemapToLogicalOrder(),
+        Transpose(axes=(0, 1, 3, 2)),
+        BandpassFilter(),
+        QuadratureDemodulation(),
+        Lambda(lambda data: (print(data.shape), data)[1]),
+        # Decimation(decimation_factor=4, cic_order=2),
+        # # Data beamforming.
+        # RxBeamforming(),
+        # # Post-processing to B-mode image.
+        # EnvelopeDetection(),
+        # Transpose(axes=(0, 2, 1)),
+        # ScanConversion(x_grid, z_grid),
+        # LogCompression()
+    ),
+    placement="/GPU:0")
+
+print(rf.shape)
+pipeline.prepare(rf_metadata)
+bmode = pipeline.process(cp.asarray(rf))[0].get()
+print(bmode.shape)
+
+plt.imshow(bmode, cmap="gray")
+plt.show()
diff --git a/docs/content/misc/release_notes.rst b/docs/content/misc/release_notes.rst
index b4d185c0d..c64bdf11f 100644
--- a/docs/content/misc/release_notes.rst
+++ b/docs/content/misc/release_notes.rst
@@ -1,6 +1,19 @@
 Release notes
 =============
 
+0.7.x
+-----
+
+
+0.7.0
+.....
+
+- Python API:
+
+    - added n_repeats parameter to the simple TX/RX sequences available in ARRUS, the parameter allows to specify the number of times the sequence should be repeated ("batch size"),
+    - arrus.utils.imaging reconstruction operators now requires that the input data are in the format (batch size, n TX, n RX, n samples),
+    - arrus.utils.imaging.RxBeamformingImg is no longer available. Please use ReconstructLri + some form of compouding (e.g. Mean(axis=1)),
+
 0.6.x
 -----