Feature/multi file inference #96

Merged: 18 commits, Sep 17, 2024
78 changes: 78 additions & 0 deletions scripts/merge_multiple_ravens_to_one_file.py
@@ -0,0 +1,78 @@
import pandas as pd
from pathlib import Path
import argparse
import soundfile as sf
from tqdm import tqdm


def make_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-raven-folder", "-ir",
                        help="Path to the directory containing the raven files to be merged.", type=str)
    parser.add_argument("--input-audio-folder", "-ia",
                        help="Path to the directory containing the audio files that are aligned to the annotations.",
                        type=str)
    parser.add_argument("--output-path", "-o",
                        help="Path to the output file of the merged raven annotations.", type=str)
    parser.add_argument("--include-begin-file", "-ibf", dest="include_begin_file", action="store_true")
    parser.add_argument("--no-begin-file", "-nbf", dest="include_begin_file", action="store_false")
    parser.set_defaults(include_begin_file=True)
    return parser


def main() -> None:
    """
    Merge multiple Raven annotation files into a single annotation file.
    """
    # configurations:
    args = make_parser().parse_args()
    raven_folder = Path(args.input_raven_folder)
    audio_folder = Path(args.input_audio_folder)
    output_path = Path(args.output_path)
    include_begin_file = args.include_begin_file
    # get the list of raven files
    raven_files = list(raven_folder.glob('*.txt'))
    # get the list of audio files
    audio_files = list(audio_folder.glob('*.wav'))
    # sort the audio files by name; this should match the chronological (start-time) order as well
    audio_files = sorted(audio_files)
Collaborator:
it might be nice to allow the user to define the key to sort by (sort by name can be the default)

Collaborator (author):
Not gonna implement it now; if needed in the future we can add a user-defined key ordering.
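A sketch of what such an option could look like; the --sort-key flag, its choices, and the helper dict are hypothetical, not part of this PR:

# Hypothetical extension (not in this PR): let the user choose the ordering.
parser.add_argument("--sort-key", choices=["name", "mtime"], default="name",
                    help="Attribute used to order the audio files before merging.")

# ...and in main(), replace the plain sorted(audio_files) call:
sort_keys = {"name": lambda p: p.name, "mtime": lambda p: p.stat().st_mtime}
audio_files = sorted(audio_files, key=sort_keys[args.sort_key])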

    assert len(raven_files) == len(audio_files), "The number of raven files and audio files should be the same."
    # create a mapping between the raven files and the audio files
    adapted_files_list = []
    for file in tqdm(audio_files, desc="Mapping audio files to raven files"):
        file_stem = file.stem
        adapted_raven_files = [raven_file for raven_file in raven_files if file_stem in raven_file.stem]
        assert len(adapted_raven_files) == 1, f"Expected one raven file for {file_stem}, found {len(adapted_raven_files)}"
        raven_file = adapted_raven_files[0]
        res = {"name": file_stem, "audio_file": file, "raven_file": raven_file}
        adapted_files_list.append(res)
    # create a list to store the dataframes
    df_list = []
    # iterate over the raven files
    seconds_offset = 0
    entries_offset = 0
    for entry in tqdm(adapted_files_list, desc="Merging raven files"):
        # read the raven file
        df = pd.read_csv(entry["raven_file"], sep="\t")
        # add the offsets to the begin/end times and the selection indices
        df['Begin Time (s)'] += seconds_offset
        df['End Time (s)'] += seconds_offset
        df['Selection'] += entries_offset
        if include_begin_file:
            df['Begin File'] = [entry["audio_file"].name] * df.shape[0]
        # get the audio file duration
        audio_file_duration = sf.info(entry["audio_file"]).duration
        # add the audio file duration to the offset
        seconds_offset += audio_file_duration
        entries_offset += df.shape[0]
        # add the dataframe to the list
        df_list.append(df)

    # concatenate the dataframes
    concatenated_df = pd.concat(df_list)
    # save the concatenated dataframe
    concatenated_df.to_csv(output_path, sep="\t", index=False)


if __name__ == "__main__":
    main()
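A usage sketch (the folder and file names are illustrative; the flags are the ones defined in make_parser):

python scripts/merge_multiple_ravens_to_one_file.py -ir annotations/ -ia audio/ -o merged_annotations.txt

The offsets accumulate file by file: if the first recording lasts 60 s and contains 5 selections, every row merged from the second file has its Begin Time (s) and End Time (s) shifted by 60 and its Selection index by 5, so the output reads as annotations over one continuous recording.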
1 change: 1 addition & 0 deletions soundbay/conf/experiment/inference.yaml
@@ -5,6 +5,7 @@ experiment:
   load_optimizer_state: False
   save_raven: False
   threshold: 0.5
+  raven_max_freq: null
 hydra:
   run:
     dir: .null
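Since the experiment config is handled by Hydra, the new field should be overridable from the command line. A hypothetical override (the entry-point name is assumed, not taken from this PR):

python soundbay/inference.py experiment.raven_max_freq=20000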
64 changes: 42 additions & 22 deletions soundbay/data.py
@@ -489,39 +489,61 @@
     def __init__(self, file_path: Union[str, Path],
                  preprocessors: DictConfig,
                  seq_length: float = 1,
                  data_sample_rate: int = 44100,
-                 sample_rate: int = 44100,
-                 channel: int = None):
+                 sample_rate: int = 44100):
         """
         __init__ method initiates InferenceDataset instance:
         Input:

         Output:
         InferenceDataset Object - inherits from Dataset object in PyTorch package
         """
-        self.file_path = file_path
+        self.file_path = Path(file_path)
+        self.metadata_path = self.file_path  # alias to support inference pipeline
         self.seq_length = seq_length
         self.sample_rate = sample_rate
         self.data_sample_rate = data_sample_rate
         self.sampler = torchaudio.transforms.Resample(orig_freq=data_sample_rate, new_freq=sample_rate)
         self.preprocessor = ClassifierDataset.set_preprocessor(preprocessors)
-        self.channel = channel
-        self._create_start_times()
+        self.metadata = self._create_inference_metadata()

-    def _create_start_times(self):
+    def _create_inference_metadata(self) -> pd.DataFrame:
+        """
+        create metadata to be used in the inference dataset
+        in case we have a directory, we will iterate over all files in the directory
+        and create metadata for each file and merge it together
+        For a single file, we will create metadata for that file
+        """
+        all_data_frames = []
+        if self.file_path.is_dir():
+            all_files = [self.file_path / x for x in self.file_path.iterdir()]
+        else:
+            all_files = [self.file_path]
+        for file in all_files:
+            if file.suffix not in ['.wav', '.WAV']:
+                raise ValueError(f'InferenceDataset only supports .wav files, got {file.suffix}')
+            file_start_time = self._create_start_times(file)
+            for channel_num in range(sf.info(file).channels):
+                metadata = pd.DataFrame({'filename': [file] * len(file_start_time),
+                                         'channel': [channel_num] * len(file_start_time),
+                                         'begin_time': file_start_time,
+                                         'end_time': file_start_time + self.seq_length})
+                all_data_frames.append(metadata)
+        metadata = pd.concat(all_data_frames, ignore_index=True)
+        return metadata
+
+    def _create_start_times(self, filepath: Path) -> np.ndarray:
         """
         create reference dict to extract audio files from metadata annotation
         Input:
         data_path - Path object
         Output:
         audio_dict contains references to audio paths given name from metadata
         """
-        audio_len = sf.info(self.file_path).duration
+        audio_len = sf.info(filepath).duration
         decimal_place = abs(Decimal(str(self.seq_length)).as_tuple().exponent)
-        self._start_times = np.arange(0, round(audio_len//self.seq_length * self.seq_length, decimal_place),
-                                      self.seq_length)
+        return np.arange(0, round(audio_len//self.seq_length * self.seq_length, decimal_place), self.seq_length)

-    def _get_audio(self, begin_time):
+    def _get_audio(self, filepath: Path, channel: int, begin_time: float) -> torch.Tensor:
         """
         _get_audio gets a path_to_file from _grab_fields method and also begin_time and end_time
         and returns the audio segment in a torch.tensor
@@ -534,35 +556,33 @@ def _get_audio(self, begin_time):
         output:
         audio - pytorch tensor (1-D array)
         """
-        duration = sf.info(self.file_path).duration
+        duration = sf.info(filepath).duration
         begin_time = int(begin_time * self.data_sample_rate)
         stop_time = begin_time + int(self.seq_length * self.data_sample_rate)
         assert duration * self.data_sample_rate >= stop_time, f"trying to load audio from {begin_time} to {stop_time} but audio is only {duration} long"
-        data, orig_sample_rate = sf.read(self.file_path, start=begin_time, stop=stop_time)
-        num_channels = sf.info(self.file_path).channels
-        if (self.channel is not None) and (num_channels > 1):
-            data = data[:, self.channel]
+        data, orig_sample_rate = sf.read(filepath, start=begin_time, stop=stop_time, always_2d=True)
+        data = data[:, channel]
         assert orig_sample_rate == self.data_sample_rate, \
             f'sample rate is {orig_sample_rate}, should be {self.data_sample_rate}'
         audio = torch.tensor(data, dtype=torch.float).unsqueeze(0)
         return audio

-    def __getitem__(self, idx):
+    def __getitem__(self, idx: int):
         '''
-        __getitem__ method loads item according to idx from the metadata
+        __getitem__ method loads item according to idx from the metadata.

         input:
         idx - int

         output:
-        audio, label - torch tensor (1-d if no spectrogram is applied/ 2-d if applied a spectrogram
-        , int (if mode="train" only)
+        audio - torch tensor (1-d if no spectrogram is applied/ 2-d if applied a spectrogram)
         '''
-        begin_time = int(self._start_times[idx] * self.data_sample_rate)
-        audio = self._get_audio(begin_time)
+        filepath, channel, begin_time = self.metadata.loc[idx, ['filename', 'channel', 'begin_time']]
+        audio = self._get_audio(filepath=filepath, channel=channel, begin_time=begin_time)
         audio = self.sampler(audio)
         audio = self.preprocessor(audio)

         return audio

     def __len__(self):
-        return len(self._start_times)
+        return len(self.metadata)
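A minimal usage sketch of the reworked InferenceDataset; the folder path and the empty preprocessor config are assumptions, not taken from this PR:

from omegaconf import OmegaConf
from soundbay.data import InferenceDataset

# With this PR, file_path may be a directory: the dataset builds one metadata row
# per (file, channel, window) instead of windows over a single file and channel.
dataset = InferenceDataset(
    file_path="recordings/",             # hypothetical folder of .wav files
    preprocessors=OmegaConf.create({}),  # assuming an empty config yields a no-op preprocessor
    seq_length=1.0,
    data_sample_rate=44100,
    sample_rate=44100,
)
print(len(dataset))  # number of rows in the merged metadata table
audio = dataset[0]   # resampled, preprocessed tensor for the first window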