diff --git a/.travis.yml b/.travis.yml index 4dc7ed72d6c..47c7d0b65aa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,12 +2,18 @@ # one using CMake, and one using make. env: matrix: - - WITH_CUDA=false WITH_CMAKE=false WITH_IO=true - - WITH_CUDA=false WITH_CMAKE=true WITH_IO=true PYTHON_VERSION=3 - - WITH_CUDA=true WITH_CMAKE=false WITH_IO=true - - WITH_CUDA=true WITH_CMAKE=true WITH_IO=true - - WITH_CUDA=false WITH_CMAKE=false WITH_IO=false - - WITH_CUDA=false WITH_CMAKE=true WITH_IO=false PYTHON_VERSION=3 + - WITH_CUDA=false WITH_CMAKE=false WITH_IO=true WITH_AUDIO=false + - WITH_CUDA=false WITH_CMAKE=false WITH_IO=true WITH_AUDIO=true + - WITH_CUDA=false WITH_CMAKE=true WITH_IO=true PYTHON_VERSION=3 WITH_AUDIO=false + - WITH_CUDA=false WITH_CMAKE=true WITH_IO=true PYTHON_VERSION=3 WITH_AUDIO=true + - WITH_CUDA=true WITH_CMAKE=false WITH_IO=true WITH_AUDIO=false + - WITH_CUDA=true WITH_CMAKE=false WITH_IO=true WITH_AUDIO=true + - WITH_CUDA=true WITH_CMAKE=true WITH_IO=true WITH_AUDIO=false + - WITH_CUDA=true WITH_CMAKE=true WITH_IO=true WITH_AUDIO=true + - WITH_CUDA=false WITH_CMAKE=false WITH_IO=false WITH_AUDIO=false + - WITH_CUDA=false WITH_CMAKE=false WITH_IO=false WITH_AUDIO=true + - WITH_CUDA=false WITH_CMAKE=true WITH_IO=false PYTHON_VERSION=3 WITH_AUDIO=false + - WITH_CUDA=false WITH_CMAKE=true WITH_IO=false PYTHON_VERSION=3 WITH_AUDIO=true language: cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 37f937fe489..c3840d0e5f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,7 @@ caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON) caffe_option(USE_LMDB "Build with lmdb" ON) caffe_option(USE_LEVELDB "Build with levelDB" ON) caffe_option(USE_OPENCV "Build with OpenCV support" ON) +caffe_option(USE_AUDIO "Build with audio data support" OFF) # ---[ Dependencies include(cmake/Dependencies.cmake) diff --git a/Makefile b/Makefile index 5fb6394e947..2a59003cfed 100644 --- a/Makefile +++ b/Makefile @@ -186,6 +186,9 @@ endif ifeq ($(USE_OPENCV), 1) LIBRARIES += opencv_core opencv_highgui opencv_imgproc endif +ifeq ($(USE_AUDIO), 1) + LIBRARIES += sndfile fftw3 fftw3f +endif PYTHON_LIBRARIES := boost_python python2.7 WARNINGS := -Wall -Wno-sign-compare @@ -314,6 +317,9 @@ endif ifeq ($(USE_LMDB), 1) COMMON_FLAGS += -DUSE_LMDB endif +ifeq ($(USE_AUDIO), 1) + COMMON_FLAGS += -DUSE_AUDIO +endif # CPU-only configuration ifeq ($(CPU_ONLY), 1) diff --git a/Makefile.config.example b/Makefile.config.example index a20bad2f5ce..e1cbcb3ce54 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -73,6 +73,9 @@ PYTHON_LIB := /usr/lib # Uncomment to support layers written in Python (will link against Python libs) # WITH_PYTHON_LAYER := 1 +# Uncomment to support reading from audio files +# USE_AUDIO := 1 + # Whatever else you find you need goes here. 
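For a local build, a minimal sketch of enabling the new option (dependency package names mirror the Travis install script further down; both build paths are assumptions, adjust to your setup):

# Debian/Ubuntu packages this patch links against (sndfile, fftw3, fftw3f)
sudo apt-get install libsndfile-dev libfftw3-dev

# Make build: uncomment USE_AUDIO in Makefile.config (or append it)
echo "USE_AUDIO := 1" >> Makefile.config
make all

# CMake build: the new caffe_option defaults to OFF, so request it explicitly
cmake -DUSE_AUDIO=ON ..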
INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake index 8b259965359..9f6395ae314 100644 --- a/cmake/ConfigGen.cmake +++ b/cmake/ConfigGen.cmake @@ -68,6 +68,10 @@ function(caffe_generate_export_configs) list(APPEND Caffe_DEFINITIONS -DUSE_LEVELDB) endif() + if(USE_AUDIO) + list(APPEND Caffe_DEFINITIONS -DUSE_AUDIO) + endif() + if(NOT HAVE_CUDNN) set(HAVE_CUDNN FALSE) else() diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index ff58d31c166..42b42e543d6 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -189,6 +189,26 @@ function(detect_cuDNN) endfunction() +################################################################################################ + +# Include cuFFT +set(CUFFT_ROOT "" CACHE PATH "CUFTT root folder") + +find_path(CUFFT_INCLUDE cufft.h + PATHS ${CUFFT_ROOT} $ENV{CUFFT_ROOT} ${CUDA_TOOLKIT_INCLUDE} + DOC "Path to cuFTT include directory." ) + +get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) +find_library(CUFFT_LIBRARY NAMES libcufft.so # libcufft_static.a + PATHS ${CUFFT_ROOT} $ENV{CUFFT_ROOT} ${CUFFT_INCLUDE} ${__libpath_hist} + DOC "Path to cuFFT library.") + +if(CUFFT_INCLUDE AND CUFFT_LIBRARY) + include_directories(SYSTEM ${CUFFT_INCLUDE}) + list(APPEND Caffe_LINKER_LIBS ${CUFFT_LIBRARY}) + message(WARNING "Found cuFFT (include: ${CUFFT_INCLUDE}, library: ${CUFFT_LIBRARY})") +endif() + ################################################################################################ ### Non macro section ################################################################################################ diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index d68d7bfba66..e7f5f1407b7 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -51,6 +51,25 @@ if(USE_LEVELDB) list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES}) endif() + +if(USE_AUDIO) +# ---[ SNDFILE + find_package(SNDFILE REQUIRED) + if(SNDFILE_FOUND) + include_directories(SYSTEM ${SNDFILE_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS ${SNDFILE_LIBRARIES}) + endif() + +# ---[ FFT + find_package(FFTW QUIET) + if(FFTW_FOUND) + include_directories(SYSTEM ${FFTW_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS ${FFTW_LIBRARIES}) + endif() + + add_definitions(-DUSE_AUDIO) +endif() + # ---[ CUDA include(cmake/Cuda.cmake) if(NOT HAVE_CUDA) @@ -110,18 +129,18 @@ if(BUILD_python) find_package(NumPy 1.7.1) # Find the matching boost python implementation set(version ${PYTHONLIBS_VERSION_STRING}) - + STRING( REPLACE "." "" boost_py_version ${version} ) find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}") set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND}) - + while(NOT "${version}" STREQUAL "" AND NOT Boost_PYTHON_FOUND) STRING( REGEX REPLACE "([0-9.]+).[0-9]+" "\\1" version ${version} ) - + STRING( REPLACE "." "" boost_py_version ${version} ) find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}") set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND}) - + STRING( REGEX MATCHALL "([0-9.]+).[0-9]+" has_more_version ${version} ) if("${has_more_version}" STREQUAL "") break() diff --git a/cmake/Modules/FindFFTW.cmake b/cmake/Modules/FindFFTW.cmake new file mode 100644 index 00000000000..a0683586545 --- /dev/null +++ b/cmake/Modules/FindFFTW.cmake @@ -0,0 +1,23 @@ +# - Find fftw +# Find the native fftw includes and libraries +# +# FFTW_INCLUDE_DIR - where to find fftw3.h, etc. 
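+#  FFTW_LIBRARY / FFTWF_LIBRARY - cache entries for the double- and single-precision libraries (fftw3, fftw3f) located below.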
+# FFTW_LIBRARIES - List of libraries when using fttw. +# FFTW_FOUND - True if fftw found. + +FIND_PATH(FFTW_INCLUDE_DIR NAMES fftw3.h PATHS /opt/local/include /usr/local/include /usr/include) + +FIND_LIBRARY(FFTW_LIBRARY NAMES fftw3 PATHS /usr/lib /usr/local/lib) +FIND_LIBRARY(FFTWF_LIBRARY NAMES fftw3f PATHS /usr/lib /usr/local/lib) + +SET(FFTW_LIBRARIES ${FFTW_LIBRARY} ${FFTWF_LIBRARY}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(FFTW DEFAULT_MSG + FFTW_INCLUDE_DIR FFTW_LIBRARIES) + +if(FFTW_FOUND) + set(FFTW_LIBRARIES ${FFTW_LIBRARIES}) + message(STATUS "Found FFTW (include: ${FFTW_INCLUDE_DIR}, library: ${FFTW_LIBRARIES})") + mark_as_advanced(FFTW_INCLUDE_DIR FFTW_LIBRARIES) +endif(FFTW_FOUND) diff --git a/cmake/Modules/FindSNDFILE.cmake b/cmake/Modules/FindSNDFILE.cmake new file mode 100644 index 00000000000..55942e77b45 --- /dev/null +++ b/cmake/Modules/FindSNDFILE.cmake @@ -0,0 +1,20 @@ +# - Find sndfile +# Find the native sndfile includes and libraries +# +# SNDFILE_INCLUDE_DIR - where to find sndfile.h, etc. +# SNDFILE_LIBRARIES - List of libraries when using libsndfile. +# SNDFILE_FOUND - True if libsndfile found. + +FIND_PATH(SNDFILE_INCLUDE_DIR NAMES sndfile.h PATHS $ENV{LEVELDB_ROOT}/include /opt/local/include /usr/local/include /usr/include) + +FIND_LIBRARY(SNDFILE_LIBRARIES NAMES sndfile PATHS /usr/local/lib /usr/lib $ENV{LEVELDB_ROOT}/lib) + + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(SNDFILE DEFAULT_MSG + SNDFILE_INCLUDE_DIR SNDFILE_LIBRARIES) + +if(SNDFILE_FOUND) + message(STATUS "Found LibSndFile (include: ${SNDFILE_INCLUDE_DIR}, library: ${SNDFILE_LIBRARIES})") + mark_as_advanced(SNDFILE_INCLUDE_DIR SNDFILE_LIBRARIES) +endif() diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 3d12e81a130..77681b32b8b 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -117,6 +117,7 @@ function(caffe_print_configuration_summary) caffe_status(" USE_LMDB : ${USE_LMDB}") caffe_status(" USE_LEVELDB : ${USE_LEVELDB}") caffe_status(" USE_OPENCV : ${USE_OPENCV}") + caffe_status(" USE_AUDIO : ${USE_AUDIO}") caffe_status("") caffe_status("Dependencies:") caffe_status(" BLAS : " APPLE THEN "Yes (vecLib)" ELSE "Yes (${BLAS})") @@ -134,6 +135,10 @@ function(caffe_print_configuration_summary) if(USE_OPENCV) caffe_status(" OpenCV : Yes (ver. ${OpenCV_VERSION})") endif() + if(USE_AUDIO) + caffe_status(" libsndfile : " SNDFILE_FOUND THEN "Yes" ELSE "No") + caffe_status(" fftw : " FFTW_FOUND THEN "Yes" ELSE "No") + endif() caffe_status(" CUDA : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" ) caffe_status("") if(HAVE_CUDA) diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in index 9302022d7da..4a57c1fac00 100644 --- a/cmake/Templates/caffe_config.h.in +++ b/cmake/Templates/caffe_config.h.in @@ -35,3 +35,4 @@ #cmakedefine USE_OPENCV #cmakedefine USE_LMDB #cmakedefine USE_LEVELDB +#cmakedefine USE_AUDIO diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 21a27d759a8..78aa1d5d9b6 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -624,6 +624,47 @@ class SoftmaxLayer : public Layer { Blob scale_; }; +#ifdef USE_AUDIO +/** + * @brief Computes a spectogram of input data. + * + * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
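+ *
+ * Shape sketch (mirrors Reshape/Forward_cpu below): for a bottom blob of
+ * width w, top[0] is reshaped to
+ *   [num, 1, (w - window_size + step_size) / step_size, window_size / 2].
+ * Worked example with assumed (not patch-provided) values: window_size = 512,
+ * step_size = 256, w = 16384 gives a 63 x 256 spectrogram per item.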
+ */ +template +class SpectrogramLayer : public Layer { + public: + explicit SpectrogramLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "Spectrogram"; } + virtual inline int ExactNumBottomBlobs() const { return 2; } + virtual inline int ExactNumTopBlobs() const { return 2; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + NOT_IMPLEMENTED; + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + NOT_IMPLEMENTED; + } + + int window_size_; + int step_size_; +}; +#endif + #ifdef USE_CUDNN /** * @brief cuDNN implementation of SoftmaxLayer. diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 90fd0d19917..aa94429d617 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -256,6 +256,35 @@ class ImageDataLayer : public BasePrefetchingDataLayer { int lines_id_; }; +#ifdef USE_AUDIO +/** + * @brief Provides data to the Net from audio files. + * + * TODO(dox): thorough documentation for Forward and proto params. + */ +template +class AudioDataLayer : public BasePrefetchingDataLayer { + public: + explicit AudioDataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) {} + virtual ~AudioDataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "AudioData"; } + virtual inline int ExactNumBottomBlobs() const { return 0; } + virtual inline int ExactNumTopBlobs() const { return 2; } + + protected: + shared_ptr prefetch_rng_; + virtual void ShuffleFiles(); + virtual void load_batch(Batch* batch); + + vector > lines_; + int lines_id_; +}; +#endif + /** * @brief Provides data to the Net from memory. 
* diff --git a/include/caffe/util/fft.hpp b/include/caffe/util/fft.hpp new file mode 100644 index 00000000000..cc0c658c96c --- /dev/null +++ b/include/caffe/util/fft.hpp @@ -0,0 +1,48 @@ +#ifdef USE_AUDIO +#ifndef CAFFE_UTIL_FFT_HPP +#define CAFFE_UTIL_FFT_HPP + +#include + +#include "caffe/proto/caffe.pb.h" + + +namespace caffe { + +template +struct FastFourierTransformPImpl; + +template +class FastFourierTransform_cpu { + public: + explicit FastFourierTransform_cpu(int packetSize); + ~FastFourierTransform_cpu(); + + int process(Dtype* input_data, Dtype* output_data, int size); + + private: + const int _log2Size; + const int _packetSize; + + boost::scoped_ptr > _pimpl; +}; + +template +class FastFourierTransform_gpu { + public: + explicit FastFourierTransform_gpu(int packetSize); + ~FastFourierTransform_gpu(); + + int process(Dtype* input_data, Dtype* output_data, int size); + + private: + const int _log2Size; + const int _packetSize; + + boost::scoped_ptr > _pimpl; +}; + +} // namespace caffe + +#endif // CAFFE_UTIL_FFT_HPP +#endif // USE_AUDIO diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 2cacd8e72cd..792005fd6f6 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -3,6 +3,7 @@ #include #include // for std::fabs and std::signbit +#include #include "glog/logging.h" @@ -43,6 +44,9 @@ inline void caffe_memset(const size_t N, const int alpha, void* X) { memset(X, alpha, N); // NOLINT(caffe/alt_fn) } +template +void caffe_cvnrm(std::complex* input, Dtype* output, int size); + template void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); diff --git a/include/caffe/util/read_audio.hpp b/include/caffe/util/read_audio.hpp new file mode 100644 index 00000000000..62a37a6cd56 --- /dev/null +++ b/include/caffe/util/read_audio.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include + +namespace caffe { + +int ReadAudioFile(const std::string& filePath, float* data, int capacity, + int offset = 0); +int ReadAudioFile(const std::string& filePath, double* data, int capacity, + int offset = 0); + +} // namespace caffe diff --git a/scripts/cpp_lint.py b/scripts/cpp_lint.py index f750489f4f9..c1878fc91ef 100755 --- a/scripts/cpp_lint.py +++ b/scripts/cpp_lint.py @@ -1609,6 +1609,7 @@ def CheckCaffeDataLayerSetUp(filename, clean_lines, linenum, error): ix = line.find('DataLayer::LayerSetUp') if ix >= 0 and ( line.find('void DataLayer::LayerSetUp') != -1 or + line.find('void AudioDataLayer::LayerSetUp') != -1 or line.find('void ImageDataLayer::LayerSetUp') != -1 or line.find('void MemoryDataLayer::LayerSetUp') != -1 or line.find('void WindowDataLayer::LayerSetUp') != -1): @@ -1621,6 +1622,7 @@ def CheckCaffeDataLayerSetUp(filename, clean_lines, linenum, error): if ix >= 0 and ( line.find('void Base') == -1 and line.find('void DataLayer::DataLayerSetUp') == -1 and + line.find('void AudioDataLayer::DataLayerSetUp') == -1 and line.find('void ImageDataLayer::DataLayerSetUp') == -1 and line.find('void MemoryDataLayer::DataLayerSetUp') == -1 and line.find('void WindowDataLayer::DataLayerSetUp') == -1): diff --git a/scripts/travis/travis_build_and_test.sh b/scripts/travis/travis_build_and_test.sh index 174f1ee5a0a..2bdc1f490af 100755 --- a/scripts/travis/travis_build_and_test.sh +++ b/scripts/travis/travis_build_and_test.sh @@ -10,6 +10,7 @@ if $WITH_CMAKE; then cd build CPU_ONLY=" -DCPU_ONLY=ON" if ! 
$WITH_CUDA; then + echo "********* CPU ONLY **********" CPU_ONLY=" -DCPU_ONLY=OFF" fi PYTHON_ARGS="" @@ -21,7 +22,10 @@ if $WITH_CMAKE; then else IO_ARGS="-DUSE_OPENCV=OFF -DUSE_LMDB=OFF -DUSE_LEVELDB=OFF" fi - cmake -DBUILD_python=ON -DCMAKE_BUILD_TYPE=Release $CPU_ONLY $PYTHON_ARGS -DCMAKE_INCLUDE_PATH="$CONDA_DIR/include/" -DCMAKE_LIBRARY_PATH="$CONDA_DIR/lib/" $IO_ARGS .. + if $WITH_AUDIO; then + AUDIO_ARGS="-DUSE_AUDIO=1" + fi + cmake -DBUILD_python=ON -DCMAKE_BUILD_TYPE=Release $CPU_ONLY $PYTHON_ARGS -DCMAKE_INCLUDE_PATH="$CONDA_DIR/include/" -DCMAKE_LIBRARY_PATH="$CONDA_DIR/lib/" $IO_ARGS $AUDIO_ARGS .. $MAKE $MAKE pytest if ! $WITH_CUDA; then @@ -39,6 +43,9 @@ else export USE_LEVELDB=1 export USE_OPENCV=1 fi + if $WITH_AUDIO; then + export USE_AUDIO=1 + fi $MAKE all test pycaffe warn lint || true if ! $WITH_CUDA; then $MAKE runtest diff --git a/scripts/travis/travis_install.sh b/scripts/travis/travis_install.sh index d6c6e228b58..4c72c42d09b 100755 --- a/scripts/travis/travis_install.sh +++ b/scripts/travis/travis_install.sh @@ -68,7 +68,7 @@ if [ ! -d $CONDA_DIR ]; then fi chmod +x miniconda.sh ./miniconda.sh -b -p $CONDA_DIR - + conda update --yes conda conda install --yes numpy scipy matplotlib scikit-image pip # Let conda install boost (so that boost_python matches) @@ -93,3 +93,9 @@ if [ "$PYTHON_VERSION" -eq "3" ]; then else pip install protobuf fi + +# install audio dependencies +if $WITH_AUDIO; then +apt-get -y update +apt-get install libfftw3-dev libsndfile-dev +fi diff --git a/scripts/travis/travis_setup_makefile_config.sh b/scripts/travis/travis_setup_makefile_config.sh index 83aacf11fb0..ef2aab40706 100755 --- a/scripts/travis/travis_setup_makefile_config.sh +++ b/scripts/travis/travis_setup_makefile_config.sh @@ -16,6 +16,7 @@ fi sed -i -e '/USE_LMDB/d' Makefile.config sed -i -e '/USE_LEVELDB/d' Makefile.config sed -i -e '/USE_OPENCV/d' Makefile.config +sed -i -e '/USE_AUDIO/d' Makefile.config cat << 'EOF' >> Makefile.config # Travis' nvcc doesn't like newer boost versions diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt index 40e6c11f5b0..3e675c20d77 100644 --- a/src/caffe/CMakeLists.txt +++ b/src/caffe/CMakeLists.txt @@ -32,5 +32,3 @@ install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib) file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) install(PROGRAMS ${proto_python} DESTINATION python/caffe/proto) - - diff --git a/src/caffe/layers/audio_data_layer.cpp b/src/caffe/layers/audio_data_layer.cpp new file mode 100644 index 00000000000..72b03838da7 --- /dev/null +++ b/src/caffe/layers/audio_data_layer.cpp @@ -0,0 +1,156 @@ +#ifdef USE_AUDIO +#include + +#include // NOLINT(readability/streams) +#include // NOLINT(readability/streams) +#include +#include +#include + +#include "caffe/data_layers.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/benchmark.hpp" +#include "caffe/util/io.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/util/read_audio.hpp" +#include "caffe/util/rng.hpp" + +namespace caffe { + +template +AudioDataLayer::~AudioDataLayer() { + this->StopInternalThread(); +} + +template +void AudioDataLayer::DataLayerSetUp(const vector*>& bottom, + const vector*>& top) { + string root_folder = this->layer_param_.audio_data_param().root_folder(); + + // Read the file with filenames and labels + const string& source = this->layer_param_.audio_data_param().source(); + LOG(INFO) << "Opening file " << source; + std::ifstream infile(source.c_str()); + string filename; + 
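+  // The source file is parsed as one "<filename> <integer label>" pair per line,
+  // e.g. (hypothetical listing, paths relative to root_folder):
+  //   clips/yes_000.wav 1
+  //   clips/no_000.wav 0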
int label; + while (infile >> filename >> label) { + lines_.push_back(std::make_pair(filename, label)); + } + + if (this->layer_param_.audio_data_param().shuffle()) { + // randomly shuffle data + LOG(INFO) << "Shuffling data"; + const unsigned int prefetch_rng_seed = caffe_rng_rand(); + prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); + ShuffleFiles(); + } + LOG(INFO) << "A total of " << lines_.size() << " files."; + + lines_id_ = 0; + + Datum datum; + datum.set_channels(1); + datum.set_height(1); + datum.set_width(this->layer_param_.audio_data_param().width()); + + // Use data_transformer to infer the expected blob shape from a datum. + vector top_shape = this->data_transformer_->InferBlobShape(datum); + this->transformed_data_.Reshape(top_shape); + // Reshape prefetch_data and top[0] according to the batch_size. + const int batch_size = this->layer_param_.audio_data_param().batch_size(); + top_shape[0] = batch_size; + for (int i = 0; i < this->PREFETCH_COUNT; ++i) { + this->prefetch_[i].data_.Reshape(top_shape); + } + top[0]->Reshape(top_shape); + + LOG(INFO) << "output data size: " << top[0]->num() << "," + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); + // label + vector label_shape(1, batch_size); + top[1]->Reshape(label_shape); + for (int i = 0; i < this->PREFETCH_COUNT; ++i) { + this->prefetch_[i].label_.Reshape(label_shape); + } +} + +template +void AudioDataLayer::ShuffleFiles() { + caffe::rng_t* prefetch_rng = + static_cast(prefetch_rng_->generator()); + shuffle(lines_.begin(), lines_.end(), prefetch_rng); +} + +// This function is used to create a thread that prefetches the data. +template +void AudioDataLayer::load_batch(Batch* batch) { + CPUTimer batch_timer; + batch_timer.Start(); + double read_time = 0; + double trans_time = 0; + CPUTimer timer; + CHECK(batch->data_.count()); + CHECK(this->transformed_data_.count()); + AudioDataParameter audio_data_param = this->layer_param_.audio_data_param(); + const int batch_size = audio_data_param.batch_size(); + string root_folder = audio_data_param.root_folder(); + int width = static_cast(this->layer_param_.audio_data_param().width()); + + Datum datum; + datum.set_channels(1); + datum.set_height(1); + datum.set_width(width); + + // Use data_transformer to infer the expected blob shape from a datum. + vector top_shape = this->data_transformer_->InferBlobShape(datum); + this->transformed_data_.Reshape(top_shape); + // Reshape prefetch_data according to the batch_size. + top_shape[0] = batch_size; + batch->data_.Reshape(top_shape); + + Dtype* prefetch_data = batch->data_.mutable_cpu_data(); + Dtype* prefetch_label = batch->label_.mutable_cpu_data(); + + // datum scales + const int lines_size = lines_.size(); + for (int item_id = 0; item_id < batch_size; ++item_id) { + // get a blob + timer.Start(); + + Blob blob(1, 1, 1, width); + Dtype* data = blob.mutable_cpu_data(); + + ReadAudioFile(root_folder + lines_[lines_id_].first, data, width); + + read_time += timer.MicroSeconds(); + timer.Start(); + // Apply transformations to the audio + int offset = batch->data_.offset(item_id); + this->transformed_data_.set_cpu_data(prefetch_data + offset); + this->data_transformer_->Transform(&blob, &(this->transformed_data_)); + trans_time += timer.MicroSeconds(); + + prefetch_label[item_id] = lines_[lines_id_].second; + // go to the next iter + lines_id_++; + if (lines_id_ >= lines_size) { + // We have reached the end. Restart from the first. 
+ DLOG(INFO) << "Restarting data prefetching from start."; + lines_id_ = 0; + if (this->layer_param_.audio_data_param().shuffle()) { + ShuffleFiles(); + } + } + } + batch_timer.Stop(); + DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; + DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; + DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; +} + +INSTANTIATE_CLASS(AudioDataLayer); +REGISTER_LAYER_CLASS(AudioData); + +} // namespace caffe +#endif // USE_AUDIO diff --git a/src/caffe/layers/spectrogram_layer.cpp b/src/caffe/layers/spectrogram_layer.cpp new file mode 100644 index 00000000000..c473206e808 --- /dev/null +++ b/src/caffe/layers/spectrogram_layer.cpp @@ -0,0 +1,83 @@ +#ifdef USE_AUDIO +#include + +#include "caffe/common_layers.hpp" +#include "caffe/util/fft.hpp" + +namespace caffe { + +template +void SpectrogramLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + window_size_ = this->layer_param_.spectrogram_param().window_size(); + step_size_ = this->layer_param_.spectrogram_param().step_size(); +} + +template +void SpectrogramLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + + const Dtype* bottom_labels = bottom[1]->cpu_data(); + Dtype* top_labels = top[1]->mutable_cpu_data(); + + caffe_copy(bottom[1]->count(), bottom_labels, top_labels); + + int n = bottom[0]->num(); + int w = bottom[0]->width(); + + int top_width = window_size_ / 2; + int top_height = (w - window_size_ + step_size_) / step_size_; + + vector top_shape(0); + top_shape.push_back(n); + top_shape.push_back(1); + top_shape.push_back(top_height); + top_shape.push_back(top_width); + + top[0]->Reshape(top_shape); + + FastFourierTransform_cpu fft(window_size_); + + int bottom_offset = 0; + int top_offset = 0; + for (int i = 0; i < n; ++i) { + for (int j = 0; j < top_height; ++j) { + fft.process(const_cast(bottom_data + bottom_offset), + top_data + top_offset, window_size_); + bottom_offset = bottom[0]->offset(i, 0, 0, j * step_size_); + top_offset = top[0]->offset(i, 0, j, 0); + } + } +} + +template +void SpectrogramLayer::Reshape(const vector *> &bottom, + const vector *> &top) { + top[1]->ReshapeLike(*bottom[1]); + + int n = bottom[0]->num(); + int w = bottom[0]->width(); + + int top_width = window_size_ / 2; + int top_height = (w - window_size_ + step_size_) / step_size_; + + vector top_shape(4); + top_shape[0] = n; + top_shape[1] = 1; + top_shape[2] = top_height; + top_shape[3] = top_width; + + top[0]->Reshape(top_shape); +} + +#ifdef CPU_ONLY + STUB_GPU_FORWARD(SpectrogramLayer, Forward); +#endif + + INSTANTIATE_CLASS(SpectrogramLayer); + REGISTER_LAYER_CLASS(Spectrogram); + +} // namespace caffe +#endif // USE_AUDIO diff --git a/src/caffe/layers/spectrogram_layer.cu b/src/caffe/layers/spectrogram_layer.cu new file mode 100644 index 00000000000..c7c5200fc29 --- /dev/null +++ b/src/caffe/layers/spectrogram_layer.cu @@ -0,0 +1,48 @@ +#ifdef USE_AUDIO +#include + +#include "caffe/common_layers.hpp" +#include "caffe/util/fft.hpp" + +namespace caffe { + +template +void SpectrogramLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + + int n = bottom[0]->num(); + int c = bottom[0]->channels(); + int h = bottom[0]->height(); + int w = bottom[0]->width(); + + int top_width = window_size_ / 2; + int top_height = (w - window_size_ + 
step_size_) / step_size_; + + vector top_shape(); + top_shape.push_back(n); + top_shape.push_back(1); + top_shape.push_back(top_height); + top_shape.push_back(top_width); + + top[0]->Reshape(top_shape); + + FastFourierTransform_gpu fft(window_size_); + + int bottom_offset = 0; + int top_offset = 0; + for (int i = 0; i < n; ++i) { + for (int j = 0; j < top_height; ++j) { + fft.process(const_cast(bottom_data + bottom_offset), + top_data + top_offset, window_size_); + bottom_offset = bottom[0]->offset(i, 0, 0, j * step_size_); + top_offset = top[0]->offset(i, 0, j, 0); + } + } +} + +INSTANTIATE_LAYER_GPU_FORWARD(SpectrogramLayer); + +} // namespace caffe +#endif // USE_AUDIO diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index a8747c12b37..8038247388d 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -301,7 +301,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 139 (last added: tile_param) +// LayerParameter next available layer-specific ID: 141 (last added: spectrogram_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -350,6 +350,7 @@ message LayerParameter { // The default for the engine is set by the ENGINE switch at compile-time. optional AccuracyParameter accuracy_param = 102; optional ArgMaxParameter argmax_param = 103; + optional AudioDataParameter audio_data_param = 139; optional ConcatParameter concat_param = 104; optional ContrastiveLossParameter contrastive_loss_param = 105; optional ConvolutionParameter convolution_param = 106; @@ -379,6 +380,7 @@ message LayerParameter { optional ReshapeParameter reshape_param = 133; optional SigmoidParameter sigmoid_param = 124; optional SoftmaxParameter softmax_param = 125; + optional SpectrogramParameter spectrogram_param = 140; optional SPPParameter spp_param = 132; optional SliceParameter slice_param = 126; optional TanHParameter tanh_param = 127; @@ -450,6 +452,18 @@ message ArgMaxParameter { optional int32 axis = 3; } +message AudioDataParameter { + // Specify the data source. + required string source = 1; + // Specify the batch size. + optional uint32 batch_size = 2; + // Specify number of samples. + required uint32 width = 3; + // Whether or not AudioDataLayer should shuffle the list of files at every epoch. + optional bool shuffle = 4 [default = false]; + optional string root_folder = 5 [default = ""]; +} + message ConcatParameter { // The axis along which to concatenate -- may be negative to index from the // end (e.g., -1 for the last axis). 
Other axes must have the @@ -940,6 +954,11 @@ message SoftmaxParameter { optional int32 axis = 2 [default = 1]; } +message SpectrogramParameter { + required int32 window_size = 1; + required int32 step_size = 2; +} + message TanHParameter { enum Engine { DEFAULT = 0; diff --git a/src/caffe/util/fft.cpp b/src/caffe/util/fft.cpp new file mode 100644 index 00000000000..fe430ad67ba --- /dev/null +++ b/src/caffe/util/fft.cpp @@ -0,0 +1,130 @@ +#ifdef USE_AUDIO +#include "caffe/util/fft.hpp" + +#include + +#include +#include + +#include "caffe/common.hpp" +#include "caffe/util/math_functions.hpp" + + +namespace caffe { + +template +void hammingWindow(Dtype* data, int size); + +template +struct FastFourierTransformPImpl { + explicit FastFourierTransformPImpl(int size) : buffer(size + 2), + window(size) { + hammingWindow(&window[0], size); + } + std::valarray buffer; + std::valarray window; +}; + +template <> +FastFourierTransform_cpu::FastFourierTransform_cpu(int packetSize) +: _log2Size(std::ceil(std::log(packetSize) / std::log(2))), +_packetSize(static_cast(std::pow(2, _log2Size))), +_pimpl(new FastFourierTransformPImpl(packetSize)) +{} + +template <> +FastFourierTransform_cpu::FastFourierTransform_cpu(int packetSize) +: _log2Size(std::ceil(std::log(packetSize) / std::log(2))), +_packetSize(static_cast(std::pow(2, _log2Size))), +_pimpl(new FastFourierTransformPImpl(packetSize)) +{} + +template +FastFourierTransform_cpu::~FastFourierTransform_cpu() {} + +template <> +int FastFourierTransform_cpu::process(double* input_data, + double* output_data, int size) { + CHECK_LE(size, _packetSize); + + // Apply window to data + caffe_copy(size, input_data, &_pimpl->buffer[0]); + caffe_mul(size, &_pimpl->buffer[0], &_pimpl->window[0], + &_pimpl->buffer[0]); + + fftw_plan plan = fftw_plan_dft_r2c_1d(size, &_pimpl->buffer[0], + reinterpret_cast(&_pimpl->buffer[0]), + FFTW_ESTIMATE); + CHECK(plan) << "Could not create FFT plan."; + fftw_execute(plan); + fftw_destroy_plan(plan); + + // Normalize data + caffe_cvnrm(reinterpret_cast*>(&_pimpl->buffer[0]), + &_pimpl->buffer[0], size / 2); + caffe_scal(size, 1.0 / size, &_pimpl->buffer[0]); + + if (output_data) { + caffe_copy(size / 2, &_pimpl->buffer[0], output_data); + } else { + caffe_copy(size / 2, &_pimpl->buffer[0], input_data); + } + + return size; +} + +template <> +int FastFourierTransform_cpu::process(float* input_data, + float* output_data, int size) { + CHECK_LE(size, _packetSize); + + // Apply window to data + caffe_copy(size, input_data, &_pimpl->buffer[0]); + caffe_mul(size, &_pimpl->buffer[0], &_pimpl->window[0], + &_pimpl->buffer[0]); + + fftwf_plan plan = fftwf_plan_dft_r2c_1d(size, &_pimpl->buffer[0], + reinterpret_cast(&_pimpl->buffer[0]), + FFTW_ESTIMATE); + CHECK(plan) << "Could not create FFT plan."; + fftwf_execute(plan); + fftwf_destroy_plan(plan); + + // Normalize data + caffe_cvnrm(reinterpret_cast*>(&_pimpl->buffer[0]), + &_pimpl->buffer[0], size / 2); + caffe_scal(size, 1.0f / size, &_pimpl->buffer[0]); + + if (output_data) { + caffe_copy(size / 2, &_pimpl->buffer[0], output_data); + } else { + caffe_copy(size / 2, &_pimpl->buffer[0], input_data); + } + + return size; +} + +template <> +void hammingWindow(float* data, int size) { + const float alpha = 0.54; + const float beta = 0.46; + + for (int i = 0; i < size; ++i) { + data[i] = alpha - (beta * (2 * M_PI * i / (size - 1))); + } +} + +template <> +void hammingWindow(double* data, int size) { + const double alpha = 0.54; + const double beta = 0.46; + + for (int i = 0; i < size; ++i) { 
+ data[i] = alpha - (beta * (2 * M_PI * i / (size - 1))); + } +} + +INSTANTIATE_CLASS(FastFourierTransform_cpu); + +} // namespace caffe +#endif // USE_AUDIO diff --git a/src/caffe/util/fft.cu b/src/caffe/util/fft.cu new file mode 100644 index 00000000000..17a65bd3129 --- /dev/null +++ b/src/caffe/util/fft.cu @@ -0,0 +1,92 @@ +#ifdef USE_AUDIO +#include "caffe/util/fft.hpp" + +#include +#include + +#include "caffe/common.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +struct FastFourierTransformPImpl { + cufftHandle plan; +}; + +template <> +FastFourierTransform_gpu::FastFourierTransform_gpu(int packetSize) +: _log2Size(std::ceil(std::log(packetSize) / std::log(2))), +_packetSize(static_cast(std::pow(2, _log2Size))), +_pimpl(new FastFourierTransformPImpl()) +{} + +template <> +FastFourierTransform_gpu::FastFourierTransform_gpu(int packetSize) +: _log2Size(std::ceil(std::log(packetSize) / std::log(2))), +_packetSize(static_cast(std::pow(2, _log2Size))), +_pimpl(new FastFourierTransformPImpl()) +{} + +template +FastFourierTransform_gpu::~FastFourierTransform_gpu() {} + +template <> +int FastFourierTransform_gpu::process(double* input_data, + double* output_data, int size) { + CHECK_EQ(size, _packetSize); + + if (output_data) { + caffe_copy(size, input_data, output_data); + } else { + output_data = input_data; + } + + CHECK_EQ(cufftPlan1d(&(_pimpl->plan), size, CUFFT_D2Z, 1), CUFFT_SUCCESS) + << "Creation of plan failed."; + CHECK_EQ(cufftExecD2Z(_pimpl->plan, + reinterpret_cast(output_data), + reinterpret_cast(output_data)), + CUFFT_SUCCESS) << "Execution of cuFFT failed."; + CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess) + << "CUDA failed to synchronize."; + CHECK_EQ(cufftDestroy(_pimpl->plan), CUFFT_SUCCESS) + << "Failed to destroy cuFFT."; + caffe_cvnrm(reinterpret_cast*>(output_data), + output_data, size); + caffe_scal(size, 1.0 / size, output_data); + + return size; +} + +template <> +int FastFourierTransform_gpu::process(float* input_data, + float* output_data, int size) { + CHECK_EQ(size, _packetSize); + + if (output_data) { + caffe_copy(size, input_data, output_data); + } else { + output_data = input_data; + } + + CHECK_EQ(cufftPlan1d(&(_pimpl->plan), size, CUFFT_R2C, 1), CUFFT_SUCCESS) + << "Creation of plan failed."; + CHECK_EQ(cufftExecR2C(_pimpl->plan, reinterpret_cast(output_data), + reinterpret_cast(output_data)), + CUFFT_SUCCESS) << "Execution of cuFFT failed."; + CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess) + << "CUDA failed to synchronize."; + CHECK_EQ(cufftDestroy(_pimpl->plan), CUFFT_SUCCESS) + << "Failed to destroy cuFFT."; + caffe_cvnrm(reinterpret_cast*>(output_data), + output_data, size); + caffe_scal(size, 1.0f / size, output_data); + + return size; +} + +INSTANTIATE_CLASS(FastFourierTransform_gpu); + +} // namespace caffe +#endif // USE_AUDIO diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 0aab6b17b85..781fdc447f3 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include "caffe/common.hpp" @@ -68,6 +69,20 @@ template void caffe_set(const int N, const int alpha, int* Y); template void caffe_set(const int N, const float alpha, float* Y); template void caffe_set(const int N, const double alpha, double* Y); +template <> +void caffe_cvnrm(std::complex* input, double* output, int size) { + for (int i = 0; i < size; ++i) { + output[i] = std::abs(input[i]); + } +} + +template <> +void caffe_cvnrm(std::complex* 
input, float* output, int size) { + for (int i = 0; i < size; ++i) { + output[i] = std::abs(input[i]); + } +} + template <> void caffe_add_scalar(const int N, const float alpha, float* Y) { for (int i = 0; i < N; ++i) { diff --git a/src/caffe/util/read_audio.cpp b/src/caffe/util/read_audio.cpp new file mode 100644 index 00000000000..36457cdb96f --- /dev/null +++ b/src/caffe/util/read_audio.cpp @@ -0,0 +1,82 @@ +#ifdef USE_AUDIO +#include + +#include +#include + +#include "caffe/common.hpp" +#include "caffe/util/read_audio.hpp" + + +namespace caffe { + + int ReadAudioFile(const std::string& filePath, float* data, int capacity, + int offset) { + SF_INFO info = SF_INFO(); + + SNDFILE* file = sf_open(filePath.c_str(), SFM_READ, &info); + CHECK_EQ(sf_error(file), SF_ERR_NO_ERROR) << "Can't open file '" + << filePath << "': " << sf_strerror(file); + + sf_count_t status = sf_seek(file, offset, SEEK_SET); + CHECK_NE(status, -1) << "Can't seek to offset in: '" << filePath << + "': " << sf_strerror(file); + + sf_count_t numberOfFrames; + if (info.channels != 1) { + // Non-mono audio files will only have first channel read + std::valarray tempData(info.channels * capacity); + numberOfFrames = sf_read_float(file, &tempData[0], tempData.size()); + for (int i = 0; i < numberOfFrames / info.channels; ++i) { + data[i] = tempData[i * info.channels]; + } + } else { + numberOfFrames = sf_read_float(file, data, capacity); + } + + CHECK_EQ(numberOfFrames / info.channels, capacity) << + "File could not fill provided array"; + + status = sf_close(file); + CHECK_EQ(status, 0) << "Failed to close file: ''" << filePath << "': " + << sf_strerror(file); + + return numberOfFrames; + } + + int ReadAudioFile(const std::string& filePath, double* data, int capacity, + int offset) { + SF_INFO info = SF_INFO(); + + SNDFILE* file = sf_open(filePath.c_str(), SFM_READ, &info);\ + CHECK_EQ(sf_error(file), SF_ERR_NO_ERROR) << "Can't open file '" << + filePath << "': " << sf_strerror(file); + + sf_count_t status = sf_seek(file, offset, SEEK_SET); + CHECK_NE(status, -1) << "Can't seek to offset in: '" << filePath << + "': " << sf_strerror(file); + + sf_count_t numberOfFrames; + if (info.channels != 1) { + // Non-mono audio files will only have first channel read + std::valarray tempData(info.channels * capacity); + numberOfFrames = sf_read_double(file, &tempData[0], tempData.size()); + for (int i = 0; i < numberOfFrames / info.channels; ++i) { + data[i] = tempData[i * info.channels]; + } + } else { + numberOfFrames = sf_read_double(file, data, capacity); + } + + CHECK_EQ(numberOfFrames / info.channels, capacity) << + "File could not fill provided array"; + + status = sf_close(file); + CHECK_EQ(status, 0) << "Failed to close file: ''" << filePath << "': " + << sf_strerror(file); + + return numberOfFrames; + } + +} // namespace caffe +#endif // USE_AUDIO
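For context, a hedged usage sketch of the two new layers in a training prototxt. All names, paths and parameter values below are assumptions for illustration; the field names come from AudioDataParameter and SpectrogramParameter above, and both layers produce a data top and a label top:

layer {
  name: "audio"
  type: "AudioData"
  top: "data"
  top: "label"
  audio_data_param {
    source: "train_files.txt"   # one "<filename> <label>" pair per line
    root_folder: "/data/audio/"
    batch_size: 32
    width: 16384                # number of samples read per clip
    shuffle: true
  }
}
layer {
  name: "spec"
  type: "Spectrogram"
  bottom: "data"
  bottom: "label"
  top: "spectrogram"
  top: "spectrogram_label"
  spectrogram_param {
    window_size: 512
    step_size: 256
  }
}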