From c49ef3195ac67963da3833d75d0299dd0b8a04db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ante=20Jukic=CC=81?= Date: Sat, 17 Dec 2022 00:40:02 -0800 Subject: [PATCH] Update RIR generation scripts - fix: reduce room size if evaluation of params fails - added randomized mic placement - added diffuse noise generation - added an option to specify the format and subtype for saved audio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ante Jukić --- nemo/collections/asr/data/data_simulation.py | 916 +++++++++++------- .../asr/parts/utils/audio_utils.py | 8 +- .../asr/test_asr_data_simulation.py | 40 +- .../collections/asr/utils/test_audio_utils.py | 11 +- 4 files changed, 616 insertions(+), 359 deletions(-) diff --git a/nemo/collections/asr/data/data_simulation.py b/nemo/collections/asr/data/data_simulation.py index 4899c9097f4f..5bbdcdfb5605 100644 --- a/nemo/collections/asr/data/data_simulation.py +++ b/nemo/collections/asr/data/data_simulation.py @@ -13,8 +13,10 @@ # limitations under the License. 
import concurrent +import itertools import multiprocessing import os +import random import warnings from typing import Dict, Iterable, List, Optional, Tuple, Union @@ -33,7 +35,7 @@ from nemo.collections.asr.parts.preprocessing.perturb import process_augmentations from nemo.collections.asr.parts.preprocessing.segment import AudioSegment -from nemo.collections.asr.parts.utils.audio_utils import db2mag, mag2db, pow2db, rms +from nemo.collections.asr.parts.utils.audio_utils import db2mag, generate_approximate_noise_field, mag2db, pow2db, rms from nemo.collections.asr.parts.utils.data_simulation_utils import ( DataAnnotator, SpeechSampler, @@ -1993,6 +1995,9 @@ def convert_placement_to_range( if not np.all(np.array(room_dim) > 0): raise ValueError(f'Room dimensions must be positive: {room_dim}') + if object_radius < 0: + raise ValueError(f'Object radius must be non-negative: {object_radius}') + placement_range = [None] * 3 min_to_wall = placement.get('min_to_wall', 0) @@ -2117,7 +2122,13 @@ def check_cfg(self): if mic_cfg is None: raise ValueError('Mic configuration not provided') - for key in ['positions', 'placement', 'orientation']: + if mic_cfg.get('positions') == 'random': + # Only num_mics and placement are required + mic_cfg_keys = ['num_mics', 'placement'] + else: + mic_cfg_keys = ['positions', 'placement', 'orientation'] + + for key in mic_cfg_keys: if key not in mic_cfg: raise ValueError(f'Mic array {key} not provided') @@ -2148,27 +2159,7 @@ def generate_room_params(self) -> dict: room_cfg = self.cfg.room - # width, length, height - room_dim = np.zeros(3) - - # prepare dimensions - for idx, key in enumerate(['width', 'length', 'height']): - # get configured dimension - dim = room_cfg.dim[key] - - # set a value - if dim is None: - raise ValueError(f'Room {key} needs to be a scalar or a range, currently it is None') - elif np.isscalar(dim): - assert dim > 0, f'Dimension should be positive for {key}: {dim}' - room_dim[idx] = dim - elif len(dim) == 2: - assert 0 
< dim[0] <= dim[1], f'Expecting two non-decreasing values for {key}, received {dim}' - room_dim[idx] = self.random.uniform(low=dim[0], high=dim[1]) - else: - raise ValueError(f'Unexpected value for {key}: {dim}') - - # prepare rt60 + # Prepare rt60 if room_cfg.rt60 is None: raise ValueError(f'Room RT60 needs to be a scalar or a range, currently it is None') @@ -2183,8 +2174,45 @@ def generate_room_params(self) -> dict: else: raise ValueError(f'Unexpected value for RT60: {room_cfg.rt60}') - # Get parameters from size and RT60 - room_absorption, room_max_order = pra.inverse_sabine(rt60, room_dim) + # Generate a room with random dimensions + num_retries = self.cfg.get('num_retries', 20) + + for n in range(num_retries): + + # width, length, height + room_dim = np.zeros(3) + + # prepare dimensions + for idx, key in enumerate(['width', 'length', 'height']): + # get configured dimension + dim = room_cfg.dim[key] + + # set a value + if dim is None: + raise ValueError(f'Room {key} needs to be a scalar or a range, currently it is None') + elif np.isscalar(dim): + assert dim > 0, f'Dimension should be positive for {key}: {dim}' + room_dim[idx] = dim + elif len(dim) == 2: + assert 0 < dim[0] <= dim[1], f'Expecting two non-decreasing values for {key}, received {dim}' + # Reduce dimension if the previous attempt failed + room_dim[idx] = self.random.uniform(low=dim[0], high=dim[1] - n * (dim[1] - dim[0]) / num_retries) + else: + raise ValueError(f'Unexpected value for {key}: {dim}') + + try: + # Get parameters from size and RT60 + room_absorption, room_max_order = pra.inverse_sabine(rt60, room_dim) + break + except Exception as e: + logging.debug('Inverse sabine failed: %s', str(e)) + # Inverse sabine may fail if the room is too large for the selected RT60. + # Try again by generate a smaller room. 
+ room_absorption = room_max_order = None + continue + + if room_absorption is None or room_max_order is None: + raise RuntimeError(f'Evaluation of parameters failed for RT60 {rt60}s and room size {room_dim}.') # Return the required values room_params = { @@ -2208,43 +2236,64 @@ def generate_array(self, room_dim: Iterable[float]) -> ArrayGeometry: Randomly placed microphone array. """ mic_cfg = self.cfg.mic_array - mic_array = ArrayGeometry(mic_cfg.positions) - # Randomize center placement - center = np.zeros(3) - placement_range = convert_placement_to_range( - placement=mic_cfg.placement, room_dim=room_dim, object_radius=mic_array.radius - ) + if mic_cfg.positions == 'random': + # Create a random set of microphones + num_mics = mic_cfg.num_mics + mic_positions = [] - for idx in range(len(center)): - center[idx] = self.random.uniform(low=placement_range[idx][0], high=placement_range[idx][1]) + # Each microphone is placed individually + placement_range = convert_placement_to_range( + placement=mic_cfg.placement, room_dim=room_dim, object_radius=0 + ) - # Place the array at the configured center point - mic_array.translate(to=center) + # Randomize mic placement + for m in range(num_mics): + position_m = [None] * 3 + for idx in range(3): + position_m[idx] = self.random.uniform(low=placement_range[idx][0], high=placement_range[idx][1]) + mic_positions.append(position_m) - # Randomize orientation - orientation = dict() - for key in ['yaw', 'roll', 'pitch']: - # angle for current orientation - angle = mic_cfg.orientation[key] + mic_array = ArrayGeometry(mic_positions) - if angle is None: - raise ValueError(f'Mic array {key} should be a scalar or a range, currently it is set to None.') + else: + mic_array = ArrayGeometry(mic_cfg.positions) - # check it's within the expected range - check_angle(key, angle) + # Randomize center placement + center = np.zeros(3) + placement_range = convert_placement_to_range( + placement=mic_cfg.placement, room_dim=room_dim, 
object_radius=mic_array.radius + ) - if np.isscalar(angle): - orientation[key] = angle - elif len(angle) == 2: - assert angle[0] <= angle[1], f"Expecting two non-decreasing values for {key}, received {angle}" - # generate integer values, for easier bucketing, if necessary - orientation[key] = self.random.uniform(low=angle[0], high=angle[1]) - else: - raise ValueError(f'Unexpected value for orientation {key}: {angle}') + for idx in range(len(center)): + center[idx] = self.random.uniform(low=placement_range[idx][0], high=placement_range[idx][1]) + + # Place the array at the configured center point + mic_array.translate(to=center) + + # Randomize orientation + orientation = dict() + for key in ['yaw', 'roll', 'pitch']: + # angle for current orientation + angle = mic_cfg.orientation[key] + + if angle is None: + raise ValueError(f'Mic array {key} should be a scalar or a range, currently it is set to None.') - # Rotate the array to match the selected orientation - mic_array.rotate(**orientation) + # check it's within the expected range + check_angle(key, angle) + + if np.isscalar(angle): + orientation[key] = angle + elif len(angle) == 2: + assert angle[0] <= angle[1], f"Expecting two non-decreasing values for {key}, received {angle}" + # generate integer values, for easier bucketing, if necessary + orientation[key] = self.random.uniform(low=angle[0], high=angle[1]) + else: + raise ValueError(f'Unexpected value for orientation {key}: {angle}') + + # Rotate the array to match the selected orientation + mic_array.rotate(**orientation) return mic_array @@ -2326,9 +2375,12 @@ def generate(self): examples.append(example) # Simulation - if self.num_workers is not None and self.num_workers > 1: - logging.info(f'Simulate using {self.num_workers} workers') - with multiprocessing.Pool(processes=self.num_workers) as pool: + if (num_workers := self.cfg.get('num_workers')) is None: + num_workers = os.cpu_count() - 1 + + if num_workers > 1: + logging.info(f'Simulate using {num_workers} 
workers') + with multiprocessing.Pool(processes=num_workers) as pool: metadata = list(tqdm(pool.imap(simulate_room_kwargs, examples), total=len(examples))) else: @@ -2810,22 +2862,23 @@ def check_cfg(self): # interference configuration interference_cfg = self.cfg.get('interference') if not interference_cfg: - raise ValueError( - 'Interference configuration not provided. Expecting audio manifests in format {subset: path_to_manifest}' - ) - interference_probability = interference_cfg.get('interference_probability', 0) - max_num_interferers = interference_cfg.get('max_num_interferers', 0) - min_azimuth_to_target = interference_cfg.get('min_azimuth_to_target', 0) - if interference_probability is not None: - if interference_probability < 0: - raise ValueError(f'Interference probability must be non-negative. Current value: {interference_prob}') - elif interference_probability > 0: - assert ( - max_num_interferers is not None and max_num_interferers > 0 - ), f'Max number of interferers must be positive. Current value: {max_num_interferers}' - assert ( - min_azimuth_to_target is not None and min_azimuth_to_target >= 0 - ), f'Min azimuth to target must be non-negative' + logging.info('Interference configuration not provided.') + else: + interference_probability = interference_cfg.get('interference_probability', 0) + max_num_interferers = interference_cfg.get('max_num_interferers', 0) + min_azimuth_to_target = interference_cfg.get('min_azimuth_to_target', 0) + if interference_probability is not None: + if interference_probability < 0: + raise ValueError( + f'Interference probability must be non-negative. Current value: {interference_prob}' + ) + elif interference_probability > 0: + assert ( + max_num_interferers is not None and max_num_interferers > 0 + ), f'Max number of interferers must be positive. 
Current value: {max_num_interferers}' + assert ( + min_azimuth_to_target is not None and min_azimuth_to_target >= 0 + ), f'Min azimuth to target must be non-negative' # mix configuration mix_cfg = self.cfg.get('mix') @@ -2836,71 +2889,6 @@ def check_cfg(self): if 'ref_mic_rms' not in mix_cfg: raise ValueError('Reference microphone RMS not defined.') - def get_audio_list( - self, metadata: List[dict], min_duration: float, manifest_filepath: str = None, duration_eps: float = 0.01 - ) -> List[dict]: - """Prepare a list of audio files with duration of at least min_duration. - Audio files are randomly selected from manifest metadata. - - If a selected file is longer than required duration, then a random offset is selected - before taking a min_duration segment. - If a selected file is shorter than the required duration, then a the whole file is selected - and a next file is randomly selected. - Needs manifest filepath to support relative path resolution. - - Args: - metadata: metadata loaded from a manifest file - min_duration: minimal duration for the output file - manifest_filepath: path to the manifest file, used to resolve relative paths. - For relative paths, manifest parent directory is assume to - be the base directory. - duration_eps: A small extra duration selected from each file. This is to make - sure that the signal will be long enough even if it needs to be - resampled, etc. - - Returns: - List of audio files with some metadata (offset, duration). 
- """ - # load a bit more than required, to compensate to floor rounding - # when loading samples from a file - total_duration = additional_duration = 0 - - audio_list = [] - - while total_duration < min_duration + additional_duration: - - data = self.random.choice(metadata) - audio_filepath = data['audio_filepath'] - if not os.path.isabs(audio_filepath) and manifest_filepath is not None: - manifest_dir = os.path.dirname(manifest_filepath) - audio_filepath = os.path.join(manifest_dir, audio_filepath) - - remaining_duration = min_duration - total_duration + additional_duration - - # select a random offset - if data['duration'] <= remaining_duration: - # take the whole noise file - offset = 0 - duration = data['duration'] - additional_duration += duration_eps - else: - # select a random offset in seconds - max_offset = data['duration'] - remaining_duration - offset = self.random.uniform(low=0, high=max_offset) - duration = remaining_duration - - audio_example = { - 'audio_filepath': audio_filepath, - 'offset': offset, - 'duration': duration, - 'type': data.get('type'), - } - - audio_list.append(audio_example) - total_duration += duration - - return audio_list - def generate_target(self, subset: str) -> dict: """ Prepare a dictionary with target configuration. @@ -2927,41 +2915,81 @@ def generate_target(self, subset: str) -> dict: Returns: Dictionary with target configuration, including room, source index, and audio information. """ + # Utility function + def select_target_source(room_metadata, room_indices): + """Find a room and a source that satisfies the constraints. 
+ """ + for room_index in room_indices: + # Select room + room_data = room_metadata[room_index] + + # Candidate sources + sources = self.random.choice(room_data['num_sources'], size=self.num_retries, replace=False) + + # Select target source in this room + for source in sources: + # Check constraints + constraints_met = [] + for constraint in ['azimuth', 'elevation', 'distance']: + if self.cfg.target.get(constraint) is not None: + # Check that the selected source is in the range + source_value = room_data[f'source_{constraint}'][source] + if self.cfg.target[constraint][0] <= source_value <= self.cfg.target[constraint][1]: + constraints_met.append(True) + else: + constraints_met.append(False) + # No need to check the remaining constraints + break + + # Check if a feasible source is found + if all(constraints_met): + # A feasible source has been found + return source, room_index + + return None, None + # Prepare room & source position room_metadata = self.metadata[subset]['room'] - - for _ in range(self.num_retries): - # Select room - room_index = self.random.integers(low=0, high=len(room_metadata)) - room_data = room_metadata[room_index] - - # Select target source in this room - for _ in range(self.num_retries): - # Select a source for the target - source = self.random.integers(low=0, high=room_data['num_sources']) - # Check constraints - for constraint in ['azimuth', 'elevation', 'distance']: - if self.cfg.target.get(constraint) is None: - continue - else: - # Check that the selected source is in the range - source_value = room_data[f'source_{constraint}'][source] - if self.cfg.target[constraint][0] <= source_value <= self.cfg.target[constraint][1]: - continue - else: - # Pick a new one - source = None - break - - if source is not None: - # A feasible source has been found - break + room_indices = self.random.choice(len(room_metadata), size=self.num_retries, replace=False) + source, room_index = select_target_source(room_metadata, room_indices) if source is None: 
raise RuntimeError(f'Could not find a feasible source given target constraints {self.cfg.target}') - # Prepare audio data - audio_data = self.random.choice(self.metadata[subset]['target']) + room_data = room_metadata[room_index] + + # Optional: select subset of channels + num_available_mics = len(room_data['mic_positions']) + if 'mic_array' in self.cfg: + num_mics = self.cfg.mic_array['num_mics'] + mic_selection = self.cfg.mic_array['selection'] + + if mic_selection == 'random': + logging.debug('Randomly selecting %d mics', num_mics) + selected_mics = self.random.choice(num_available_mics, size=num_mics, replace=False) + elif isinstance(mic_selection, Iterable): + logging.debug('Using explicitly selected mics: %s', str(mic_selection)) + assert ( + 0 <= min(mic_selection) < num_available_mics + ), f'Expecting mic_selection in range [0,{num_available_mics}), current value: {mic_selection}' + selected_mics = np.array(mic_selection) + else: + raise ValueError(f'Unexpected value for mic_selection: {mic_selection}') + else: + logging.debug('Using all %d available mics', num_available_mics) + num_mics = num_available_mics + selected_mics = np.arange(num_mics) + + # Double-check the number of mics is as expected + assert ( + len(selected_mics) == num_mics + ), f'Expecting {num_mics} mics, but received {len(selected_mics)} mics: {selected_mics}' + logging.debug('Selected mics: %s', str(selected_mics)) + + # Calculate distance from the source to each microphone + mic_positions = np.array(room_data['mic_positions'])[selected_mics] + source_position = np.array(room_data['source_position'][source]) + distance_source_to_mic = np.linalg.norm(mic_positions - source_position, axis=1) # Handle relative paths room_filepath = room_data['room_filepath'] @@ -2969,49 +2997,25 @@ def generate_target(self, subset: str) -> dict: manifest_dir = os.path.dirname(self.cfg.room[subset]) room_filepath = os.path.join(manifest_dir, room_filepath) - audio_filepath = audio_data['audio_filepath'] - if 
not os.path.isabs(audio_filepath): - manifest_dir = os.path.dirname(self.cfg.target[subset]) - audio_filepath = os.path.join(manifest_dir, audio_filepath) - target_cfg = { 'room_index': int(room_index), 'room_filepath': room_filepath, 'source': source, 'rt60': room_data['rir_rt60_measured'][source], - 'num_mics': len(room_data['mic_positions']), + 'selected_mics': selected_mics.tolist(), + # Positions + 'source_position': source_position.tolist(), + 'mic_positions': mic_positions.tolist(), + # Relative to center of the array 'azimuth': room_data['source_azimuth'][source], 'elevation': room_data['source_elevation'][source], 'distance': room_data['source_distance'][source], - 'audio_filepath': audio_filepath, - 'text': audio_data.get('text'), - 'duration': audio_data['duration'], + # Relative to mics + 'distance_source_to_mic': distance_source_to_mic, } return target_cfg - def generate_noise(self, subset: str, target_cfg: dict) -> List[dict]: - """ - Prepare a list of dictionaries with noise configuration. - - Args: - subset: string denoting a subset which will be used to select noise audio. - target_cfg: dictionary with target configuration. This is used determine - the minimal required duration for the noise signal. - - Returns: - List of dictionary with noise configuration, including audio information - for one or more noise files. - """ - if (noise_metadata := self.metadata[subset]['noise']) is None: - return None - - noise_cfg = self.get_audio_list( - noise_metadata, min_duration=target_cfg['duration'], manifest_filepath=self.cfg.noise[subset] - ) - - return noise_cfg - def generate_interference(self, subset: str, target_cfg: dict) -> List[dict]: """ Prepare a list of dictionaries with interference configuration. 
@@ -3084,14 +3088,11 @@ def generate_interference(self, subset: str, target_cfg: dict) -> List[dict]: # Current source setup interfering_source = { 'source': source, + 'selected_mics': target_cfg['selected_mics'], + 'position': room_data['source_position'][source], 'azimuth': room_data['source_azimuth'][source], 'elevation': room_data['source_elevation'][source], 'distance': room_data['source_distance'][source], - 'audio': self.get_audio_list( - interference_metadata, - min_duration=target_cfg['duration'], - manifest_filepath=self.cfg.interference[subset], - ), } # Done with interference for this source @@ -3099,7 +3100,7 @@ def generate_interference(self, subset: str, target_cfg: dict) -> List[dict]: return interference_cfg - def generate_mix(self, subset: str) -> dict: + def generate_mix(self, subset: str, target_cfg: dict) -> dict: """Generate scaling parameters for mixing the target speech at the microphone, background noise and interference signal at the microphone. @@ -3114,6 +3115,7 @@ def generate_mix(self, subset: str) -> dict: Args: subset: string denoting the subset of configuration + target_cfg: dictionary with target configuration Returns: Dictionary containing configured RSNR, RSIR, ref_mic @@ -3121,13 +3123,13 @@ def generate_mix(self, subset: str) -> dict: """ mix_cfg = dict() - for key in ['rsnr', 'rsir', 'ref_mic', 'ref_mic_rms']: + for key in ['rsnr', 'rsir', 'ref_mic', 'ref_mic_rms', 'min_duration']: if key in self.cfg.mix[subset]: # Take the value from subset config - value = self.cfg.mix[subset][key] + value = self.cfg.mix[subset].get(key) else: # Take the global value - value = self.cfg.mix[key] + value = self.cfg.mix.get(key) if value is None: mix_cfg[key] = None @@ -3140,6 +3142,13 @@ def generate_mix(self, subset: str) -> dict: # Select one of the multiple values mix_cfg[key] = self.random.choice(value) + if mix_cfg['ref_mic'] == 'closest': + # Select the closest mic as the reference + mix_cfg['ref_mic'] = 
np.argmin(target_cfg['distance_source_to_mic']) + + # Configuration for saving individual components + mix_cfg['save'] = OmegaConf.to_object(self.cfg.mix['save']) if 'save' in self.cfg.mix else {} + return mix_cfg def generate(self): @@ -3181,9 +3190,8 @@ def generate(self): for n_example in tqdm(range(num_examples), total=num_examples, desc=f'Preparing {subset}'): # prepare configuration target_cfg = self.generate_target(subset) - noise_cfg = self.generate_noise(subset, target_cfg) interference_cfg = self.generate_interference(subset, target_cfg) - mix_cfg = self.generate_mix(subset) + mix_cfg = self.generate_mix(subset, target_cfg) # base file name base_output_filepath = os.path.join(output_dir_subset, f'{subset}_example_{n_example:09d}') @@ -3192,7 +3200,6 @@ def generate(self): example = { 'sample_rate': self.sample_rate, 'target_cfg': target_cfg, - 'noise_cfg': noise_cfg, 'interference_cfg': interference_cfg, 'mix_cfg': mix_cfg, 'base_output_filepath': base_output_filepath, @@ -3200,13 +3207,33 @@ def generate(self): examples.append(example) + # Audio data + audio_metadata = { + 'target': self.metadata[subset]['target'], + 'target_dir': os.path.dirname(self.cfg.target[subset]), # manifest_dir + 'noise': self.metadata[subset]['noise'], + 'noise_dir': os.path.dirname(self.cfg.noise[subset]), # manifest_dir + } + + if interference_cfg is not None: + audio_metadata.update( + { + 'interference': self.metadata[subset]['interference'], + 'interference_dir': os.path.dirname(self.cfg.interference[subset]), # manifest_dir + } + ) + # Simulation - if self.num_workers is not None and self.num_workers > 1: - logging.info(f'Simulate using {self.num_workers} workers') - with multiprocessing.Pool(processes=self.num_workers) as pool: + if (num_workers := self.cfg.get('num_workers')) is None: + num_workers = os.cpu_count() - 1 + + if num_workers is not None and num_workers > 1: + logging.info(f'Simulate using {num_workers} workers') + examples_and_audio_metadata = zip(examples, 
itertools.repeat(audio_metadata, len(examples))) + with multiprocessing.Pool(processes=num_workers) as pool: metadata = list( tqdm( - pool.imap(simulate_room_mix_kwargs, examples), + pool.imap(simulate_room_mix_helper, examples_and_audio_metadata), total=len(examples), desc=f'Simulating {subset}', ) @@ -3215,10 +3242,10 @@ def generate(self): logging.info('Simulate using a single worker') metadata = [] for example in tqdm(examples, total=len(examples), desc=f'Simulating {subset}'): - metadata.append(simulate_room_mix(**example)) + metadata.append(simulate_room_mix(**example, audio_metadata=audio_metadata)) # Save manifest - manifest_filepath = os.path.join(output_dir, f'{subset}_manifest.json') + manifest_filepath = os.path.join(output_dir, f'{os.path.basename(output_dir)}_{subset}.json') if os.path.exists(manifest_filepath) and os.path.isfile(manifest_filepath): raise RuntimeError(f'Manifest config file exists: {manifest_filepath}') @@ -3232,7 +3259,7 @@ def generate(self): write_manifest(manifest_filepath, metadata) # Generate plots with information about generated data - plot_filepath = os.path.join(output_dir, f'{subset}_info.png') + plot_filepath = os.path.join(output_dir, f'{os.path.basename(output_dir)}_{subset}_info.png') if os.path.exists(plot_filepath) and os.path.isfile(plot_filepath): raise RuntimeError(f'Plot file exists: {plot_filepath}') @@ -3269,6 +3296,7 @@ def convolve_rir(signal: np.ndarray, rir: np.ndarray) -> np.ndarray: out = np.zeros((num_samples, num_channels)) for m in range(num_channels): out[:, m] = convolve(signal, rir[:, m])[:num_samples] + else: raise RuntimeError(f'RIR with {rir.ndim} not supported') @@ -3334,7 +3362,7 @@ def simultaneously_active_rms( x: np.ndarray, y: np.ndarray, sample_rate: float, - rms_threshold_db: float = -40, + rms_threshold_db: float = -60, window_len_ms: float = 200, min_active_duration: float = 0.5, ) -> Tuple[float, float]: @@ -3421,43 +3449,198 @@ def scaled_disturbance( return scaled_disturbance -def 
load_audio_from_multiple_files(items: List[Dict], sample_rate: int, total_len: int) -> np.ndarray: - """Load an audio from multiple files and concatenate into a single signal. +def prepare_source_signal( + signal_type: str, + sample_rate: int, + audio_data: List[dict], + audio_dir: Optional[str] = None, + min_duration: Optional[int] = None, + ref_signal: Optional[np.ndarray] = None, + mic_positions: Optional[np.ndarray] = None, + num_retries: int = 10, +) -> tuple: + """Prepare an audio signal for a source. Args: - items: list of dictionaries, each item has audio_filepath, offset, and duration - sample_rate: desired sample rate of the signal - total_len: total length in samples + signal_type: 'point' or 'diffuse' + sample_rate: Sampling rate for the signal + audio_data: List of audio items, each is a dictionary with audio_filepath, duration, offset and optionally text + audio_dir: Base directory for resolving paths, e.g., manifest basedir + min_duration: Minimal duration to be loaded if ref_signal is not provided, in seconds + ref_signal: Optional, used to determine the length of the signal + mic_positions: Optional, used to prepare approximately diffuse signal + num_retries: Number of retries when selecting the source files Returns: - Numpy array, shape (total_len, num_channels) + (audio_signal, metadata), where audio_signal is an ndarray and metadata is a dictionary + with audio filepaths, durations and offsets """ - if items is None: - # Nothing is provided + if not signal_type in ['point', 'diffuse']: + raise ValueError(f'Unexpected signal type {signal_type}.') + + if audio_data is None: + # No data to load return None - signal = None - samples_to_load = total_len - # if necessary, load multiple from files - for item in items: - check_min_sample_rate(item['audio_filepath'], sample_rate) - # load the pre-defined segment - segment = AudioSegment.from_file( - audio_file=item['audio_filepath'], target_sr=sample_rate, offset=item['offset'], 
duration=item['duration'], - ) - # not perfect, since different files may have different distributions - segment_samples = normalize_max(segment.samples) - # concatenate - signal = np.concatenate((signal, segment_samples)) if signal is not None else segment_samples - # remaining samples - samples_to_load -= len(segment_samples) + metadata = {} + + if ref_signal is None: + audio_signal = None + # load at least one sample if min_duration is not provided + samples_to_load = int(min_duration * sample_rate) if min_duration is not None else 1 + source_signals_metadata = {'audio_filepath': [], 'duration': [], 'offset': [], 'text': []} + + while samples_to_load > 0: + # Select a random item and load the audio + item = random.choice(audio_data) + + audio_filepath = item['audio_filepath'] + if not os.path.isabs(audio_filepath) and audio_dir is not None: + audio_filepath = os.path.join(audio_dir, audio_filepath) + + # Load audio + check_min_sample_rate(audio_filepath, sample_rate) + audio_segment = AudioSegment.from_file( + audio_file=audio_filepath, + target_sr=sample_rate, + duration=item['duration'], + offset=item.get('offset', 0), + ) + + if signal_type == 'point': + if audio_segment.num_channels > 1: + raise RuntimeError( + f'Expecting single-channel source signal, but received {audio_segment.num_channels}. 
File: {audio_filepath}' + ) + else: + raise ValueError(f'Unexpected signal type {signal_type}.') + + source_signals_metadata['audio_filepath'].append(audio_filepath) + source_signals_metadata['duration'].append(item['duration']) + source_signals_metadata['duration'].append(item.get('offset', 0)) + source_signals_metadata['text'].append(item.get('text')) + + # not perfect, since different files may have different distributions + segment_samples = normalize_max(audio_segment.samples) + # concatenate + audio_signal = ( + np.concatenate((audio_signal, segment_samples)) if audio_signal is not None else segment_samples + ) + # remaining samples + samples_to_load -= len(segment_samples) + + # Finally, we need only the metadata for the complete signal + metadata = { + 'duration': sum(source_signals_metadata['duration']), + 'offset': 0, + } + + # Add text only if all source signals have text + if all([isinstance(tt, str) for tt in source_signals_metadata['text']]): + metadata['text'] = ' '.join(source_signals_metadata['text']) + else: + # Load a signal with total_len samples and ensure it has enough simultaneous activity/overlap with ref_signal + # Concatenate multiple files if necessary + total_len = len(ref_signal) + + for n in range(num_retries): + + audio_signal = None + source_signals_metadata = {'audio_filepath': [], 'duration': [], 'offset': []} + + if signal_type == 'point': + samples_to_load = total_len + elif signal_type == 'diffuse': + # Load longer signal so it can be reshaped into (samples, mics) and + # used to generate approximately diffuse noise field + num_mics = len(mic_positions) + samples_to_load = num_mics * total_len + + while samples_to_load > 0: + # Select an audio file + item = random.choice(audio_data) + + audio_filepath = item['audio_filepath'] + if not os.path.isabs(audio_filepath) and audio_dir is not None: + audio_filepath = os.path.join(audio_dir, audio_filepath) + + # Load audio signal + check_min_sample_rate(audio_filepath, sample_rate) + + 
if (max_offset := item['duration'] - np.ceil(samples_to_load / sample_rate)) > 0: + # Load with a random offset if the example is longer than samples_to_load + offset = random.uniform(0, max_offset) + duration = -1 + else: + # Load the whole file + offset, duration = 0, item['duration'] + audio_segment = AudioSegment.from_file( + audio_file=audio_filepath, target_sr=sample_rate, duration=duration, offset=offset + ) + + # Prepare a single-channel signal + if audio_segment.num_channels == 1: + # Take all samples + segment_samples = audio_segment.samples + else: + # Take a random channel + selected_channel = random.choice(range(audio_segment.num_channels)) + segment_samples = audio_segment.samples[:, selected_channel] + + source_signals_metadata['audio_filepath'].append(audio_filepath) + source_signals_metadata['duration'].append(len(segment_samples) / sample_rate) + source_signals_metadata['offset'].append(offset) + + # not perfect, since different files may have different distributions + segment_samples = normalize_max(segment_samples) + # concatenate + audio_signal = ( + np.concatenate((audio_signal, segment_samples)) if audio_signal is not None else segment_samples + ) + # remaining samples + samples_to_load -= len(segment_samples) + + if signal_type == 'diffuse' and num_mics > 1: + try: + # Trim and reshape to num_mics to prepare num_mics source signals + audio_signal = audio_signal[: num_mics * total_len].reshape(num_mics, -1).T + + # Make spherically diffuse noise + audio_signal = generate_approximate_noise_field( + mic_positions=np.array(mic_positions), noise_signal=audio_signal, sample_rate=sample_rate + ) + except Exception as e: + logging.info('Failed to generate approximate noise field: %s', str(e)) + logging.info('Try again.') + # Try again + audio_signal, source_signals_metadata = None, {} + continue + + # Trim to length + audio_signal = audio_signal[:total_len, ...] 
+ + # Include the channel dimension if the reference includes it + if ref_signal.ndim == 2 and audio_signal.ndim == 1: + audio_signal = audio_signal[:, None] + + try: + # Signal and ref_signal should be simultaneously active + simultaneously_active_rms(ref_signal, audio_signal, sample_rate=sample_rate) + # We have enough overlap + break + except Exception as e: + # Signal and ref_signal are not overlapping, try again + logging.info('Exception: %s', str(e)) + logging.info('Signals are not overlapping, try again.') + audio_signal, source_signals_metadata = None, {} + continue - if samples_to_load <= 0: - break - # trim to length - signal = signal[:total_len, ...] + if audio_signal is None: + logging.warning('Audio signal not set: %s.', signal_type) - return signal + metadata['source_signals'] = source_signals_metadata + + return audio_signal, metadata def check_min_sample_rate(filepath: str, sample_rate: float): @@ -3479,9 +3662,9 @@ def check_min_sample_rate(filepath: str, sample_rate: float): def simulate_room_mix( sample_rate: int, target_cfg: dict, - noise_cfg: List[dict], interference_cfg: dict, mix_cfg: dict, + audio_metadata: dict, base_output_filepath: str, max_amplitude: float = 0.999, eps: float = 1e-16, @@ -3499,6 +3682,7 @@ def simulate_room_mix( index mix_cfg: Dictionary with the mixture configuration. Includes RSNR, RSIR, ref_mic and ref_mic_rms. + audio_metadata: Dictionary with a list of files for target, noise and interference base_output_filepath: All output audio files will be saved with this prefix by adding a diffierent suffix for each component, e.g., _mic.wav. max_amplitude: Maximum amplitude of the mic signal, used to prevent clipping. @@ -3510,7 +3694,9 @@ def simulate_room_mix( output manifest file. 
""" # Local utilities - def load_rir(room_filepath: str, source: int, sample_rate: float, rir_key: str = 'rir') -> np.ndarray: + def load_rir( + room_filepath: str, source: int, selected_mics: list, sample_rate: float, rir_key: str = 'rir' + ) -> np.ndarray: """Load a RIR and check that the sample rate is matching the desired sample rate Args: @@ -3527,31 +3713,84 @@ def load_rir(room_filepath: str, source: int, sample_rate: float, rir_key: str = raise RuntimeError( f'RIR sample rate ({sample_rate}) is not matching the expected sample rate ({sample_rate}). File: {room_filepath}' ) - return rir + return rir[:, selected_mics] + + def get_early_rir( + rir: np.ndarray, rir_anechoic: np.ndarray, sample_rate: int, early_duration: float = 0.050 + ) -> np.ndarray: + """Return only the early part of the RIR. + """ + early_len = int(early_duration * sample_rate) + direct_path_delay = np.min(np.argmax(rir_anechoic, axis=0)) + rir_early = rir.copy() + rir_early[direct_path_delay + early_len :, :] = 0 + return rir_early + + def save_audio( + base_path: str, + tag: str, + audio_signal: Optional[np.ndarray], + sample_rate: int, + save: str = 'all', + ref_mic: Optional[int] = None, + format: str = 'wav', + subtype: str = 'float', + ): + """Save audio signal and return filepath. 
+ """ + if (audio_signal is None) or (not save): + return None + + if save == 'ref_mic': + # save only ref_mic + audio_signal = audio_signal[:, ref_mic] + + audio_filepath = base_path + f'_{tag}.{format}' + sf.write(audio_filepath, audio_signal, sample_rate, subtype) + + return audio_filepath # Target RIRs - target_rir = load_rir(target_cfg['room_filepath'], source=target_cfg['source'], sample_rate=sample_rate) + target_rir = load_rir( + target_cfg['room_filepath'], + source=target_cfg['source'], + selected_mics=target_cfg['selected_mics'], + sample_rate=sample_rate, + ) target_rir_anechoic = load_rir( - target_cfg['room_filepath'], source=target_cfg['source'], sample_rate=sample_rate, rir_key='anechoic' + target_cfg['room_filepath'], + source=target_cfg['source'], + sample_rate=sample_rate, + selected_mics=target_cfg['selected_mics'], + rir_key='anechoic', ) + target_rir_early = get_early_rir(rir=target_rir, rir_anechoic=target_rir_anechoic, sample_rate=sample_rate) # Target signals - check_min_sample_rate(target_cfg['audio_filepath'], sample_rate) - target_segment = AudioSegment.from_file( - audio_file=target_cfg['audio_filepath'], target_sr=sample_rate, duration=target_cfg['duration'] + target_signal, target_metadata = prepare_source_signal( + signal_type='point', + sample_rate=sample_rate, + audio_data=audio_metadata['target'], + audio_dir=audio_metadata['target_dir'], + min_duration=mix_cfg['min_duration'], ) - if target_segment.num_channels > 1: - raise RuntimeError( - f'Expecting single-channel source signal, but received {target_segment.num_channels}. 
File: {target_cfg["audio_filepath"]}' - ) - target_signal = normalize_max(target_segment.samples) + source_signals_metadata = {'target': target_metadata['source_signals']} - # Convolve + # Convolve target target_reverberant = convolve_rir(target_signal, target_rir) target_anechoic = convolve_rir(target_signal, target_rir_anechoic) + target_early = convolve_rir(target_signal, target_rir_early) # Prepare noise signal - noise = load_audio_from_multiple_files(noise_cfg, sample_rate=sample_rate, total_len=len(target_reverberant)) + noise, noise_metadata = prepare_source_signal( + signal_type='diffuse', + sample_rate=sample_rate, + mic_positions=target_cfg['mic_positions'], + audio_data=audio_metadata['noise'], + audio_dir=audio_metadata['noise_dir'], + ref_signal=target_reverberant, + ) + source_signals_metadata['noise'] = noise_metadata['source_signals'] # Prepare interference signal if interference_cfg is None: @@ -3559,20 +3798,31 @@ def load_rir(room_filepath: str, source: int, sample_rate: float, rir_key: str = else: # Load interference signals interference = 0 + source_signals_metadata['interference'] = [] for i_cfg in interference_cfg: - # Load signal - i_signal = load_audio_from_multiple_files( - i_cfg['audio'], sample_rate=sample_rate, total_len=len(target_reverberant) + # Load single-channel signal for directional interference + i_signal, i_metadata = prepare_source_signal( + signal_type='point', + sample_rate=sample_rate, + audio_data=audio_metadata['interference'], + audio_dir=audio_metadata['interference_dir'], + ref_signal=target_signal, ) + source_signals_metadata['interference'].append(i_metadata['source_signals']) # Load RIR from the same room as the target, but a difference source - i_rir = load_rir(target_cfg['room_filepath'], source=i_cfg['source'], sample_rate=sample_rate) - # Convolve + i_rir = load_rir( + target_cfg['room_filepath'], + source=i_cfg['source'], + selected_mics=i_cfg['selected_mics'], + sample_rate=sample_rate, + ) + # Convolve 
interference i_reverberant = convolve_rir(i_signal, i_rir) # Sum interference += i_reverberant # Scale and add components of the signal - mix = target_reverberant.copy() + mic = target_reverberant.copy() if noise is not None: noise = scaled_disturbance( @@ -3583,7 +3833,7 @@ def load_rir(room_filepath: str, source: int, sample_rate: float, rir_key: str = ref_channel=mix_cfg['ref_mic'], ) # Update mic signal - mix += noise + mic += noise if interference is not None: interference = scaled_disturbance( @@ -3594,13 +3844,13 @@ def load_rir(room_filepath: str, source: int, sample_rate: float, rir_key: str = ref_channel=mix_cfg['ref_mic'], ) # Update mic signal - mix += interference + mic += interference # Set the final mic signal level - mix_rms = rms(mix[:, mix_cfg['ref_mic']]) - global_gain = db2mag(mix_cfg['ref_mic_rms']) / (mix_rms + eps) - mix_max = np.max(np.abs(mix)) - if (clipped_max := mix_max * global_gain) > max_amplitude: + mic_rms = rms(mic[:, mix_cfg['ref_mic']]) + global_gain = db2mag(mix_cfg['ref_mic_rms']) / (mic_rms + eps) + mic_max = np.max(np.abs(mic)) + if (clipped_max := mic_max * global_gain) > max_amplitude: # Downscale the global gain to prevent clipping + adjust ref_mic_rms accordingly clipping_prevention_gain = max_amplitude / clipped_max global_gain *= clipping_prevention_gain @@ -3612,76 +3862,71 @@ def load_rir(room_filepath: str, source: int, sample_rate: float, rir_key: str = mag2db(clipping_prevention_gain), ) - # scale all signal components - mix *= global_gain - target_reverberant *= global_gain - target_anechoic *= global_gain - if noise is not None: - noise *= global_gain - if interference is not None: - interference *= global_gain - # save signals - mic_filepath = base_output_filepath + '_mic.wav' - sf.write(mic_filepath, mix, sample_rate, 'float') + signals = { + 'mic': mic, + 'target_reverberant': target_reverberant, + 'target_anechoic': target_anechoic, + 'target_early': target_early, + 'noise': noise, + 'interference': 
interference, + } - target_reverberant_filepath = base_output_filepath + '_target_reverberant.wav' - sf.write(target_reverberant_filepath, target_reverberant, sample_rate, 'float') + metadata = {} - target_anechoic_filepath = base_output_filepath + '_target_anechoic.wav' - sf.write(target_anechoic_filepath, target_anechoic, sample_rate, 'float') + for tag, signal in signals.items(): - if noise is not None: - noise_filepath = base_output_filepath + '_noise.wav' - sf.write(noise_filepath, noise, sample_rate, 'float') - else: - noise_filepath = None + if signal is not None: + # scale all signal components with the global gain + signal = global_gain * signal - if interference is not None: - interference_filepath = base_output_filepath + '_interference.wav' - sf.write(interference_filepath, interference, sample_rate, 'float') - else: - interference_filepath = None - - # calculate DRR - direct_path_delay = np.argmax(target_rir_anechoic, axis=0) - drr = calculate_drr(target_rir, sample_rate, direct_path_delay) + audio_filepath = save_audio( + base_path=base_output_filepath, + tag=tag, + audio_signal=signal, + sample_rate=sample_rate, + save=mix_cfg['save'].get(tag, 'all'), + ref_mic=mix_cfg['ref_mic'], + format=mix_cfg['save'].get('format', 'wav'), + subtype=mix_cfg['save'].get('subtype', 'float'), + ) - metadata = { - 'audio_filepath': mic_filepath, - 'target_reverberant_filepath': target_reverberant_filepath, - 'target_anechoic_filepath': target_anechoic_filepath, - 'noise_filepath': noise_filepath, - 'interference_filepath': interference_filepath, - 'text': target_cfg.get('text'), - 'duration': target_cfg['duration'], - 'target_cfg': target_cfg, - 'noise_cfg': noise_cfg, - 'interference_cfg': interference_cfg, - 'mix_cfg': mix_cfg, - 'rt60': target_cfg.get('rt60'), - 'drr': drr, - 'rsnr': None if noise_cfg is None else mix_cfg['rsnr'], - 'rsir': None if interference_cfg is None else mix_cfg['rsir'], - } + if tag == 'mic': + metadata['audio_filepath'] = audio_filepath + 
else: + metadata[tag + '_filepath'] = audio_filepath + + # Add metadata + metadata.update( + { + 'text': target_metadata.get('text'), + 'duration': target_metadata['duration'], + 'target_cfg': target_cfg, + 'interference_cfg': interference_cfg, + 'mix_cfg': mix_cfg, + 'ref_channel': mix_cfg.get('ref_mic'), + 'rt60': target_cfg.get('rt60'), + 'drr': calculate_drr(target_rir, sample_rate, n_direct=np.argmax(target_rir_anechoic, axis=0)), + 'rsnr': None if noise is None else mix_cfg['rsnr'], + 'rsir': None if interference is None else mix_cfg['rsir'], + 'source_signals': source_signals_metadata, + } + ) return convert_numpy_to_serializable(metadata) -def simulate_room_mix_kwargs(kwargs: dict) -> dict: - """Wrapper around `simulate_room_mix` to handle kwargs. - - `pool.map(simulate_room_kwargs, examples)` would be - equivalent to `pool.starstarmap(simulate_room_mix, examples)` - if `starstarmap` would exist. +def simulate_room_mix_helper(example_and_audio_metadata: tuple) -> dict: + """Wrapper around `simulate_room_mix` for pool.imap. 
 Args: - kwargs: kwargs that are forwarded to `simulate_room_mix` + example_and_audio_metadata: tuple of (example, audio_metadata); `example` is unpacked as keyword arguments and `audio_metadata` is passed by keyword to `simulate_room_mix` Returns: Dictionary with metadata, see `simulate_room_mix` """ - return simulate_room_mix(**kwargs) + example, audio_metadata = example_and_audio_metadata + return simulate_room_mix(**example, audio_metadata=audio_metadata) def plot_mix_manifest_info(filepath: str, plot_filepath: str = None): @@ -3720,8 +3965,11 @@ def plot_mix_manifest_info(filepath: str, plot_filepath: str = None): drr += data['drr'] # average DRR across all mics # noise - rsnr.append(data['rsnr']) - rsir.append(data['rsir']) + if data['rsnr'] is not None: + rsnr.append(data['rsnr']) + + if data['rsir'] is not None: + rsir.append(data['rsir']) # plot plt.figure(figsize=(12, 6)) @@ -3760,21 +4008,21 @@ def plot_mix_manifest_info(filepath: str, plot_filepath: str = None): plt.hist(drr, label='DRR') plt.xlabel('DRR / dB') plt.ylabel('# examples') - plt.title('DRR (average over mics)') + plt.title('DRR [avg over mics]') - if not any([val is None for val in rsnr]): + if len(rsnr) > 0: plt.subplot(2, 4, 7) plt.hist(rsnr, label='RSNR') plt.xlabel('RSNR / dB') plt.ylabel('# examples') - plt.title('RSNR') + plt.title(f'RSNR [{100 * len(rsnr) / len(rt60):.0f}% ex]') - if not any([val is None for val in rsir]): + if len(rsir): plt.subplot(2, 4, 8) plt.hist(rsir, label='RSIR') plt.xlabel('RSIR / dB') plt.ylabel('# examples') - plt.title('RSIR') + plt.title(f'RSIR [{100 * len(rsir) / len(rt60):.0f}% ex]') for n in range(8): plt.subplot(2, 4, n + 1) diff --git a/nemo/collections/asr/parts/utils/audio_utils.py b/nemo/collections/asr/parts/utils/audio_utils.py index de67fc00c73a..80dfc74950a5 100644 --- a/nemo/collections/asr/parts/utils/audio_utils.py +++ b/nemo/collections/asr/parts/utils/audio_utils.py @@ -251,7 +251,7 @@ def transform_to_match_coherence( desired_coherence: npt.NDArray, method: str = 'cholesky', ref_channel: int = 0, - corrcoef_threshold: float = 0.05, 
+ corrcoef_threshold: float = 0.2, ) -> npt.NDArray: """Transform the input multichannel signal to match the desired coherence. @@ -291,9 +291,9 @@ def transform_to_match_coherence( corrcoef_matrix = np.corrcoef(signal.transpose()) # mask the diagonal elements np.fill_diagonal(corrcoef_matrix, 0.0) - if np.any(corrcoef_matrix > corrcoef_threshold): + if np.any(np.abs(corrcoef_matrix) > corrcoef_threshold): raise RuntimeError( - f'Input channels are correlated above the threshold {corrcoef_threshold}. Off-diagonal elements of the coefficient matrix: {str(corrcoef_matrix)}.' + f'Input channels are correlated above the threshold {corrcoef_threshold}. Max abs off-diagonal element of the coefficient matrix: {np.abs(corrcoef_matrix).max()}.' ) # analysis transform @@ -324,7 +324,7 @@ def transform_to_match_coherence( # synthesis transform # transpose X from (subband, frame, channel) to (channel, subband, frame) - x = librosa.istft(X.transpose(2, 0, 1)) + x = librosa.istft(X.transpose(2, 0, 1), length=len(signal)) # (channel, sample) -> (sample, channel) x = x.transpose() diff --git a/tests/collections/asr/test_asr_data_simulation.py b/tests/collections/asr/test_asr_data_simulation.py index c3b6f3c3a535..3cddf44f7657 100644 --- a/tests/collections/asr/test_asr_data_simulation.py +++ b/tests/collections/asr/test_asr_data_simulation.py @@ -344,32 +344,40 @@ def test_simulate_room_mix(self, test_data_dir): sample_rate = 16000 target_cfg = { 'room_filepath': os.path.join(data_dir, 'test_room.h5'), + 'mic_positions': np.random.rand(6, 3), # random positions + 'selected_mics': [0, 1, 2, 3, 4, 5], 'source': 0, 'audio_filepath': os.path.join(data_dir, 'target.wav'), 'duration': 1.5, } - noise_cfg = [{'audio_filepath': os.path.join(data_dir, 'noise.wav'), 'offset': 0.8, 'duration': 1.5,}] - interference_cfg = [ - { - 'source': 1, - 'audio': [ - {'audio_filepath': os.path.join(data_dir, 'interference_1.wav'), 'offset': 0.0, 'duration': 0.8}, - { - 'audio_filepath': 
os.path.join(data_dir, 'interference_2.wav'), - 'offset': 0.05, - 'duration': 0.701, - }, - ], - } - ] + interference_cfg = [{'source': 1, 'selected_mics': target_cfg['selected_mics']}] + + audio_metadata = { + 'target': [{'audio_filepath': 'target.wav', 'duration': 1.5, 'offset': 0.8}], + 'target_dir': data_dir, + 'noise': [{'audio_filepath': 'noise.wav', 'duration': 2.3}], + 'noise_dir': data_dir, + 'interference': [ + {'audio_filepath': 'interference_1.wav', 'duration': 0.8}, + {'audio_filepath': 'interference_2.wav', 'duration': 0.75}, + ], + 'interference_dir': data_dir, + } - mix_cfg = {'rsnr': 10, 'rsir': 15, 'ref_mic': 0, 'ref_mic_rms': -30} + mix_cfg = {'rsnr': 10, 'rsir': 15, 'ref_mic': 0, 'ref_mic_rms': -30, 'min_duration': None, 'save': {}} with tempfile.TemporaryDirectory() as output_dir: # Mix base_output_filepath = os.path.join(output_dir, 'test_output') - simulate_room_mix(sample_rate, target_cfg, noise_cfg, interference_cfg, mix_cfg, base_output_filepath) + simulate_room_mix( + sample_rate=sample_rate, + target_cfg=target_cfg, + interference_cfg=interference_cfg, + mix_cfg=mix_cfg, + audio_metadata=audio_metadata, + base_output_filepath=base_output_filepath, + ) # Check target + noise + interference = mix mix_from_parts = 0 diff --git a/tests/collections/asr/utils/test_audio_utils.py b/tests/collections/asr/utils/test_audio_utils.py index 00976d1361f5..c3fcc9d00c22 100644 --- a/tests/collections/asr/utils/test_audio_utils.py +++ b/tests/collections/asr/utils/test_audio_utils.py @@ -384,18 +384,19 @@ def test_generate_approximate_noise_field( mic_positions[:, 0] = mic_spacing * np.arange(num_mics) # UUT - noise_field = generate_approximate_noise_field(mic_positions, noise_signal, sample_rate, fft_length=fft_length) + noise_field = generate_approximate_noise_field( + mic_positions, noise_signal, sample_rate=sample_rate, field=field, fft_length=fft_length + ) # Compare the estimated coherence with the theoretical coherence - analysis_fft_length = 256 
# reference golden_coherence = theoretical_coherence( - mic_positions, sample_rate=sample_rate, fft_length=analysis_fft_length + mic_positions, sample_rate=sample_rate, field=field, fft_length=fft_length ) # estimated - N = librosa.stft(noise_field.transpose(), n_fft=analysis_fft_length) + N = librosa.stft(noise_field.transpose(), n_fft=fft_length) # (channel, subband, frame) -> (subband, frame, channel) N = N.transpose(1, 2, 0) uut_coherence = estimated_coherence(N) @@ -412,7 +413,7 @@ def test_generate_approximate_noise_field( if not os.path.exists(figure_dir): os.mkdir(figure_dir) - freq = librosa.fft_frequencies(sr=sample_rate, n_fft=analysis_fft_length) + freq = librosa.fft_frequencies(sr=sample_rate, n_fft=fft_length) freq = freq / 1e3 # kHz plt.figure(figsize=(7, 10))