diff --git a/.travis.yml b/.travis.yml index 4dc7ed72d6c..47c7d0b65aa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,12 +2,18 @@ # one using CMake, and one using make. env: matrix: - - WITH_CUDA=false WITH_CMAKE=false WITH_IO=true - - WITH_CUDA=false WITH_CMAKE=true WITH_IO=true PYTHON_VERSION=3 - - WITH_CUDA=true WITH_CMAKE=false WITH_IO=true - - WITH_CUDA=true WITH_CMAKE=true WITH_IO=true - - WITH_CUDA=false WITH_CMAKE=false WITH_IO=false - - WITH_CUDA=false WITH_CMAKE=true WITH_IO=false PYTHON_VERSION=3 + - WITH_CUDA=false WITH_CMAKE=false WITH_IO=true WITH_AUDIO=false + - WITH_CUDA=false WITH_CMAKE=false WITH_IO=true WITH_AUDIO=true + - WITH_CUDA=false WITH_CMAKE=true WITH_IO=true PYTHON_VERSION=3 WITH_AUDIO=false + - WITH_CUDA=false WITH_CMAKE=true WITH_IO=true PYTHON_VERSION=3 WITH_AUDIO=true + - WITH_CUDA=true WITH_CMAKE=false WITH_IO=true WITH_AUDIO=false + - WITH_CUDA=true WITH_CMAKE=false WITH_IO=true WITH_AUDIO=true + - WITH_CUDA=true WITH_CMAKE=true WITH_IO=true WITH_AUDIO=false + - WITH_CUDA=true WITH_CMAKE=true WITH_IO=true WITH_AUDIO=true + - WITH_CUDA=false WITH_CMAKE=false WITH_IO=false WITH_AUDIO=false + - WITH_CUDA=false WITH_CMAKE=false WITH_IO=false WITH_AUDIO=true + - WITH_CUDA=false WITH_CMAKE=true WITH_IO=false PYTHON_VERSION=3 WITH_AUDIO=false + - WITH_CUDA=false WITH_CMAKE=true WITH_IO=false PYTHON_VERSION=3 WITH_AUDIO=true language: cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 37f937fe489..c3840d0e5f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,7 @@ caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON) caffe_option(USE_LMDB "Build with lmdb" ON) caffe_option(USE_LEVELDB "Build with levelDB" ON) caffe_option(USE_OPENCV "Build with OpenCV support" ON) +caffe_option(USE_AUDIO "Build with audio data support" OFF) # ---[ Dependencies include(cmake/Dependencies.cmake) diff --git a/Makefile b/Makefile index 5fb6394e947..2a59003cfed 100644 --- a/Makefile +++ b/Makefile @@ -186,6 +186,9 @@ endif ifeq ($(USE_OPENCV), 1) LIBRARIES += opencv_core opencv_highgui opencv_imgproc endif +ifeq ($(USE_AUDIO), 1) + LIBRARIES += sndfile fftw3 fftw3f +endif PYTHON_LIBRARIES := boost_python python2.7 WARNINGS := -Wall -Wno-sign-compare @@ -314,6 +317,9 @@ endif ifeq ($(USE_LMDB), 1) COMMON_FLAGS += -DUSE_LMDB endif +ifeq ($(USE_AUDIO), 1) + COMMON_FLAGS += -DUSE_AUDIO +endif # CPU-only configuration ifeq ($(CPU_ONLY), 1) diff --git a/Makefile.config.example b/Makefile.config.example index a20bad2f5ce..e1cbcb3ce54 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -73,6 +73,9 @@ PYTHON_LIB := /usr/lib # Uncomment to support layers written in Python (will link against Python libs) # WITH_PYTHON_LAYER := 1 +# Uncomment to support reading from audio files +# USE_AUDIO := 1 + # Whatever else you find you need goes here. 
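For a local build, a minimal sketch of enabling the new option (dependency package names mirror the Travis install script further down; both build paths are assumptions, adjust to your setup):

# Debian/Ubuntu packages this patch links against (sndfile, fftw3, fftw3f)
sudo apt-get install libsndfile-dev libfftw3-dev

# Make build: uncomment USE_AUDIO in Makefile.config (or append it)
echo "USE_AUDIO := 1" >> Makefile.config
make all

# CMake build: the new caffe_option defaults to OFF, so request it explicitly
cmake -DUSE_AUDIO=ON ..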
INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake index 8b259965359..9f6395ae314 100644 --- a/cmake/ConfigGen.cmake +++ b/cmake/ConfigGen.cmake @@ -68,6 +68,10 @@ function(caffe_generate_export_configs) list(APPEND Caffe_DEFINITIONS -DUSE_LEVELDB) endif() + if(USE_AUDIO) + list(APPEND Caffe_DEFINITIONS -DUSE_AUDIO) + endif() + if(NOT HAVE_CUDNN) set(HAVE_CUDNN FALSE) else() diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index ff58d31c166..42b42e543d6 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -189,6 +189,26 @@ function(detect_cuDNN) endfunction() +################################################################################################ + +# Include cuFFT +set(CUFFT_ROOT "" CACHE PATH "CUFTT root folder") + +find_path(CUFFT_INCLUDE cufft.h + PATHS ${CUFFT_ROOT} $ENV{CUFFT_ROOT} ${CUDA_TOOLKIT_INCLUDE} + DOC "Path to cuFTT include directory." ) + +get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) +find_library(CUFFT_LIBRARY NAMES libcufft.so # libcufft_static.a + PATHS ${CUFFT_ROOT} $ENV{CUFFT_ROOT} ${CUFFT_INCLUDE} ${__libpath_hist} + DOC "Path to cuFFT library.") + +if(CUFFT_INCLUDE AND CUFFT_LIBRARY) + include_directories(SYSTEM ${CUFFT_INCLUDE}) + list(APPEND Caffe_LINKER_LIBS ${CUFFT_LIBRARY}) + message(WARNING "Found cuFFT (include: ${CUFFT_INCLUDE}, library: ${CUFFT_LIBRARY})") +endif() + ################################################################################################ ### Non macro section ################################################################################################ diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index d68d7bfba66..e7f5f1407b7 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -51,6 +51,25 @@ if(USE_LEVELDB) list(APPEND Caffe_LINKER_LIBS ${Snappy_LIBRARIES}) endif() + +if(USE_AUDIO) +# ---[ SNDFILE + find_package(SNDFILE REQUIRED) + if(SNDFILE_FOUND) + include_directories(SYSTEM ${SNDFILE_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS ${SNDFILE_LIBRARIES}) + endif() + +# ---[ FFT + find_package(FFTW QUIET) + if(FFTW_FOUND) + include_directories(SYSTEM ${FFTW_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS ${FFTW_LIBRARIES}) + endif() + + add_definitions(-DUSE_AUDIO) +endif() + # ---[ CUDA include(cmake/Cuda.cmake) if(NOT HAVE_CUDA) @@ -110,18 +129,18 @@ if(BUILD_python) find_package(NumPy 1.7.1) # Find the matching boost python implementation set(version ${PYTHONLIBS_VERSION_STRING}) - + STRING( REPLACE "." "" boost_py_version ${version} ) find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}") set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND}) - + while(NOT "${version}" STREQUAL "" AND NOT Boost_PYTHON_FOUND) STRING( REGEX REPLACE "([0-9.]+).[0-9]+" "\\1" version ${version} ) - + STRING( REPLACE "." "" boost_py_version ${version} ) find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}") set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND}) - + STRING( REGEX MATCHALL "([0-9.]+).[0-9]+" has_more_version ${version} ) if("${has_more_version}" STREQUAL "") break() diff --git a/cmake/Modules/FindFFTW.cmake b/cmake/Modules/FindFFTW.cmake new file mode 100644 index 00000000000..a0683586545 --- /dev/null +++ b/cmake/Modules/FindFFTW.cmake @@ -0,0 +1,23 @@ +# - Find fftw +# Find the native fftw includes and libraries +# +# FFTW_INCLUDE_DIR - where to find fftw3.h, etc. 
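+#  FFTW_LIBRARY / FFTWF_LIBRARY - cache entries for the double- and single-precision libraries (fftw3, fftw3f) located below.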
+# FFTW_LIBRARIES - List of libraries when using fttw. +# FFTW_FOUND - True if fftw found. + +FIND_PATH(FFTW_INCLUDE_DIR NAMES fftw3.h PATHS /opt/local/include /usr/local/include /usr/include) + +FIND_LIBRARY(FFTW_LIBRARY NAMES fftw3 PATHS /usr/lib /usr/local/lib) +FIND_LIBRARY(FFTWF_LIBRARY NAMES fftw3f PATHS /usr/lib /usr/local/lib) + +SET(FFTW_LIBRARIES ${FFTW_LIBRARY} ${FFTWF_LIBRARY}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(FFTW DEFAULT_MSG + FFTW_INCLUDE_DIR FFTW_LIBRARIES) + +if(FFTW_FOUND) + set(FFTW_LIBRARIES ${FFTW_LIBRARIES}) + message(STATUS "Found FFTW (include: ${FFTW_INCLUDE_DIR}, library: ${FFTW_LIBRARIES})") + mark_as_advanced(FFTW_INCLUDE_DIR FFTW_LIBRARIES) +endif(FFTW_FOUND) diff --git a/cmake/Modules/FindSNDFILE.cmake b/cmake/Modules/FindSNDFILE.cmake new file mode 100644 index 00000000000..55942e77b45 --- /dev/null +++ b/cmake/Modules/FindSNDFILE.cmake @@ -0,0 +1,20 @@ +# - Find sndfile +# Find the native sndfile includes and libraries +# +# SNDFILE_INCLUDE_DIR - where to find sndfile.h, etc. +# SNDFILE_LIBRARIES - List of libraries when using libsndfile. +# SNDFILE_FOUND - True if libsndfile found. + +FIND_PATH(SNDFILE_INCLUDE_DIR NAMES sndfile.h PATHS $ENV{LEVELDB_ROOT}/include /opt/local/include /usr/local/include /usr/include) + +FIND_LIBRARY(SNDFILE_LIBRARIES NAMES sndfile PATHS /usr/local/lib /usr/lib $ENV{LEVELDB_ROOT}/lib) + + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(SNDFILE DEFAULT_MSG + SNDFILE_INCLUDE_DIR SNDFILE_LIBRARIES) + +if(SNDFILE_FOUND) + message(STATUS "Found LibSndFile (include: ${SNDFILE_INCLUDE_DIR}, library: ${SNDFILE_LIBRARIES})") + mark_as_advanced(SNDFILE_INCLUDE_DIR SNDFILE_LIBRARIES) +endif() diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 3d12e81a130..77681b32b8b 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -117,6 +117,7 @@ function(caffe_print_configuration_summary) caffe_status(" USE_LMDB : ${USE_LMDB}") caffe_status(" USE_LEVELDB : ${USE_LEVELDB}") caffe_status(" USE_OPENCV : ${USE_OPENCV}") + caffe_status(" USE_AUDIO : ${USE_AUDIO}") caffe_status("") caffe_status("Dependencies:") caffe_status(" BLAS : " APPLE THEN "Yes (vecLib)" ELSE "Yes (${BLAS})") @@ -134,6 +135,10 @@ function(caffe_print_configuration_summary) if(USE_OPENCV) caffe_status(" OpenCV : Yes (ver. ${OpenCV_VERSION})") endif() + if(USE_AUDIO) + caffe_status(" libsndfile : " SNDFILE_FOUND THEN "Yes" ELSE "No") + caffe_status(" fftw : " FFTW_FOUND THEN "Yes" ELSE "No") + endif() caffe_status(" CUDA : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No" ) caffe_status("") if(HAVE_CUDA) diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in index 9302022d7da..4a57c1fac00 100644 --- a/cmake/Templates/caffe_config.h.in +++ b/cmake/Templates/caffe_config.h.in @@ -35,3 +35,4 @@ #cmakedefine USE_OPENCV #cmakedefine USE_LMDB #cmakedefine USE_LEVELDB +#cmakedefine USE_AUDIO diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index 21a27d759a8..78aa1d5d9b6 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -624,6 +624,47 @@ class SoftmaxLayer : public Layer { Blob scale_; }; +#ifdef USE_AUDIO +/** + * @brief Computes a spectogram of input data. + * + * TODO(dox): thorough documentation for Forward, Backward, and proto params. 
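+ *
+ * Shape sketch (mirrors Reshape/Forward_cpu below): for a bottom blob of
+ * width w, top[0] is reshaped to
+ *   [num, 1, (w - window_size + step_size) / step_size, window_size / 2].
+ * Worked example with assumed (not patch-provided) values: window_size = 512,
+ * step_size = 256, w = 16384 gives a 63 x 256 spectrogram per item.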
+ */ +template +class SpectrogramLayer : public Layer { + public: + explicit SpectrogramLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "Spectrogram"; } + virtual inline int ExactNumBottomBlobs() const { return 2; } + virtual inline int ExactNumTopBlobs() const { return 2; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + NOT_IMPLEMENTED; + } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + NOT_IMPLEMENTED; + } + + int window_size_; + int step_size_; +}; +#endif + #ifdef USE_CUDNN /** * @brief cuDNN implementation of SoftmaxLayer. diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index 90fd0d19917..aa94429d617 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -256,6 +256,35 @@ class ImageDataLayer : public BasePrefetchingDataLayer { int lines_id_; }; +#ifdef USE_AUDIO +/** + * @brief Provides data to the Net from audio files. + * + * TODO(dox): thorough documentation for Forward and proto params. + */ +template +class AudioDataLayer : public BasePrefetchingDataLayer { + public: + explicit AudioDataLayer(const LayerParameter& param) + : BasePrefetchingDataLayer(param) {} + virtual ~AudioDataLayer(); + virtual void DataLayerSetUp(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "AudioData"; } + virtual inline int ExactNumBottomBlobs() const { return 0; } + virtual inline int ExactNumTopBlobs() const { return 2; } + + protected: + shared_ptr prefetch_rng_; + virtual void ShuffleFiles(); + virtual void load_batch(Batch* batch); + + vector > lines_; + int lines_id_; +}; +#endif + /** * @brief Provides data to the Net from memory. 
* diff --git a/include/caffe/util/fft.hpp b/include/caffe/util/fft.hpp new file mode 100644 index 00000000000..cc0c658c96c --- /dev/null +++ b/include/caffe/util/fft.hpp @@ -0,0 +1,48 @@ +#ifdef USE_AUDIO +#ifndef CAFFE_UTIL_FFT_HPP +#define CAFFE_UTIL_FFT_HPP + +#include + +#include "caffe/proto/caffe.pb.h" + + +namespace caffe { + +template +struct FastFourierTransformPImpl; + +template +class FastFourierTransform_cpu { + public: + explicit FastFourierTransform_cpu(int packetSize); + ~FastFourierTransform_cpu(); + + int process(Dtype* input_data, Dtype* output_data, int size); + + private: + const int _log2Size; + const int _packetSize; + + boost::scoped_ptr > _pimpl; +}; + +template +class FastFourierTransform_gpu { + public: + explicit FastFourierTransform_gpu(int packetSize); + ~FastFourierTransform_gpu(); + + int process(Dtype* input_data, Dtype* output_data, int size); + + private: + const int _log2Size; + const int _packetSize; + + boost::scoped_ptr > _pimpl; +}; + +} // namespace caffe + +#endif // CAFFE_UTIL_FFT_HPP +#endif // USE_AUDIO diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 2cacd8e72cd..792005fd6f6 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -3,6 +3,7 @@ #include #include // for std::fabs and std::signbit +#include #include "glog/logging.h" @@ -43,6 +44,9 @@ inline void caffe_memset(const size_t N, const int alpha, void* X) { memset(X, alpha, N); // NOLINT(caffe/alt_fn) } +template +void caffe_cvnrm(std::complex* input, Dtype* output, int size); + template void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); diff --git a/include/caffe/util/read_audio.hpp b/include/caffe/util/read_audio.hpp new file mode 100644 index 00000000000..62a37a6cd56 --- /dev/null +++ b/include/caffe/util/read_audio.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include + +namespace caffe { + +int ReadAudioFile(const std::string& filePath, float* data, int capacity, + int offset = 0); +int ReadAudioFile(const std::string& filePath, double* data, int capacity, + int offset = 0); + +} // namespace caffe diff --git a/scripts/cpp_lint.py b/scripts/cpp_lint.py index f750489f4f9..c1878fc91ef 100755 --- a/scripts/cpp_lint.py +++ b/scripts/cpp_lint.py @@ -1609,6 +1609,7 @@ def CheckCaffeDataLayerSetUp(filename, clean_lines, linenum, error): ix = line.find('DataLayer::LayerSetUp') if ix >= 0 and ( line.find('void DataLayer::LayerSetUp') != -1 or + line.find('void AudioDataLayer::LayerSetUp') != -1 or line.find('void ImageDataLayer::LayerSetUp') != -1 or line.find('void MemoryDataLayer::LayerSetUp') != -1 or line.find('void WindowDataLayer::LayerSetUp') != -1): @@ -1621,6 +1622,7 @@ def CheckCaffeDataLayerSetUp(filename, clean_lines, linenum, error): if ix >= 0 and ( line.find('void Base') == -1 and line.find('void DataLayer::DataLayerSetUp') == -1 and + line.find('void AudioDataLayer::DataLayerSetUp') == -1 and line.find('void ImageDataLayer::DataLayerSetUp') == -1 and line.find('void MemoryDataLayer::DataLayerSetUp') == -1 and line.find('void WindowDataLayer::DataLayerSetUp') == -1): diff --git a/scripts/travis/travis_build_and_test.sh b/scripts/travis/travis_build_and_test.sh index 174f1ee5a0a..2bdc1f490af 100755 --- a/scripts/travis/travis_build_and_test.sh +++ b/scripts/travis/travis_build_and_test.sh @@ -10,6 +10,7 @@ if $WITH_CMAKE; then cd build CPU_ONLY=" -DCPU_ONLY=ON" if ! 
$WITH_CUDA; then + echo "********* CPU ONLY **********" CPU_ONLY=" -DCPU_ONLY=OFF" fi PYTHON_ARGS="" @@ -21,7 +22,10 @@ if $WITH_CMAKE; then else IO_ARGS="-DUSE_OPENCV=OFF -DUSE_LMDB=OFF -DUSE_LEVELDB=OFF" fi - cmake -DBUILD_python=ON -DCMAKE_BUILD_TYPE=Release $CPU_ONLY $PYTHON_ARGS -DCMAKE_INCLUDE_PATH="$CONDA_DIR/include/" -DCMAKE_LIBRARY_PATH="$CONDA_DIR/lib/" $IO_ARGS .. + if $WITH_AUDIO; then + AUDIO_ARGS="-DUSE_AUDIO=1" + fi + cmake -DBUILD_python=ON -DCMAKE_BUILD_TYPE=Release $CPU_ONLY $PYTHON_ARGS -DCMAKE_INCLUDE_PATH="$CONDA_DIR/include/" -DCMAKE_LIBRARY_PATH="$CONDA_DIR/lib/" $IO_ARGS $AUDIO_ARGS .. $MAKE $MAKE pytest if ! $WITH_CUDA; then @@ -39,6 +43,9 @@ else export USE_LEVELDB=1 export USE_OPENCV=1 fi + if $WITH_AUDIO; then + export USE_AUDIO=1 + fi $MAKE all test pycaffe warn lint || true if ! $WITH_CUDA; then $MAKE runtest diff --git a/scripts/travis/travis_install.sh b/scripts/travis/travis_install.sh index d6c6e228b58..4c72c42d09b 100755 --- a/scripts/travis/travis_install.sh +++ b/scripts/travis/travis_install.sh @@ -68,7 +68,7 @@ if [ ! -d $CONDA_DIR ]; then fi chmod +x miniconda.sh ./miniconda.sh -b -p $CONDA_DIR - + conda update --yes conda conda install --yes numpy scipy matplotlib scikit-image pip # Let conda install boost (so that boost_python matches) @@ -93,3 +93,9 @@ if [ "$PYTHON_VERSION" -eq "3" ]; then else pip install protobuf fi + +# install audio dependencies +if $WITH_AUDIO; then +apt-get -y update +apt-get install libfftw3-dev libsndfile-dev +fi diff --git a/scripts/travis/travis_setup_makefile_config.sh b/scripts/travis/travis_setup_makefile_config.sh index 83aacf11fb0..ef2aab40706 100755 --- a/scripts/travis/travis_setup_makefile_config.sh +++ b/scripts/travis/travis_setup_makefile_config.sh @@ -16,6 +16,7 @@ fi sed -i -e '/USE_LMDB/d' Makefile.config sed -i -e '/USE_LEVELDB/d' Makefile.config sed -i -e '/USE_OPENCV/d' Makefile.config +sed -i -e '/USE_AUDIO/d' Makefile.config cat << 'EOF' >> Makefile.config # Travis' nvcc doesn't like newer boost versions diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt index 40e6c11f5b0..3e675c20d77 100644 --- a/src/caffe/CMakeLists.txt +++ b/src/caffe/CMakeLists.txt @@ -32,5 +32,3 @@ install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib) file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) install(PROGRAMS ${proto_python} DESTINATION python/caffe/proto) - - diff --git a/src/caffe/layers/audio_data_layer.cpp b/src/caffe/layers/audio_data_layer.cpp new file mode 100644 index 00000000000..72b03838da7 --- /dev/null +++ b/src/caffe/layers/audio_data_layer.cpp @@ -0,0 +1,156 @@ +#ifdef USE_AUDIO +#include + +#include // NOLINT(readability/streams) +#include // NOLINT(readability/streams) +#include +#include +#include + +#include "caffe/data_layers.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/benchmark.hpp" +#include "caffe/util/io.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/util/read_audio.hpp" +#include "caffe/util/rng.hpp" + +namespace caffe { + +template +AudioDataLayer::~AudioDataLayer() { + this->StopInternalThread(); +} + +template +void AudioDataLayer::DataLayerSetUp(const vector*>& bottom, + const vector*>& top) { + string root_folder = this->layer_param_.audio_data_param().root_folder(); + + // Read the file with filenames and labels + const string& source = this->layer_param_.audio_data_param().source(); + LOG(INFO) << "Opening file " << source; + std::ifstream infile(source.c_str()); + string filename; + 
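+  // The source file is parsed as one "<filename> <integer label>" pair per line,
+  // e.g. (hypothetical listing, paths relative to root_folder):
+  //   clips/yes_000.wav 1
+  //   clips/no_000.wav 0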
int label; + while (infile >> filename >> label) { + lines_.push_back(std::make_pair(filename, label)); + } + + if (this->layer_param_.audio_data_param().shuffle()) { + // randomly shuffle data + LOG(INFO) << "Shuffling data"; + const unsigned int prefetch_rng_seed = caffe_rng_rand(); + prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed)); + ShuffleFiles(); + } + LOG(INFO) << "A total of " << lines_.size() << " files."; + + lines_id_ = 0; + + Datum datum; + datum.set_channels(1); + datum.set_height(1); + datum.set_width(this->layer_param_.audio_data_param().width()); + + // Use data_transformer to infer the expected blob shape from a datum. + vector top_shape = this->data_transformer_->InferBlobShape(datum); + this->transformed_data_.Reshape(top_shape); + // Reshape prefetch_data and top[0] according to the batch_size. + const int batch_size = this->layer_param_.audio_data_param().batch_size(); + top_shape[0] = batch_size; + for (int i = 0; i < this->PREFETCH_COUNT; ++i) { + this->prefetch_[i].data_.Reshape(top_shape); + } + top[0]->Reshape(top_shape); + + LOG(INFO) << "output data size: " << top[0]->num() << "," + << top[0]->channels() << "," << top[0]->height() << "," + << top[0]->width(); + // label + vector label_shape(1, batch_size); + top[1]->Reshape(label_shape); + for (int i = 0; i < this->PREFETCH_COUNT; ++i) { + this->prefetch_[i].label_.Reshape(label_shape); + } +} + +template +void AudioDataLayer::ShuffleFiles() { + caffe::rng_t* prefetch_rng = + static_cast(prefetch_rng_->generator()); + shuffle(lines_.begin(), lines_.end(), prefetch_rng); +} + +// This function is used to create a thread that prefetches the data. +template +void AudioDataLayer::load_batch(Batch* batch) { + CPUTimer batch_timer; + batch_timer.Start(); + double read_time = 0; + double trans_time = 0; + CPUTimer timer; + CHECK(batch->data_.count()); + CHECK(this->transformed_data_.count()); + AudioDataParameter audio_data_param = this->layer_param_.audio_data_param(); + const int batch_size = audio_data_param.batch_size(); + string root_folder = audio_data_param.root_folder(); + int width = static_cast(this->layer_param_.audio_data_param().width()); + + Datum datum; + datum.set_channels(1); + datum.set_height(1); + datum.set_width(width); + + // Use data_transformer to infer the expected blob shape from a datum. + vector top_shape = this->data_transformer_->InferBlobShape(datum); + this->transformed_data_.Reshape(top_shape); + // Reshape prefetch_data according to the batch_size. + top_shape[0] = batch_size; + batch->data_.Reshape(top_shape); + + Dtype* prefetch_data = batch->data_.mutable_cpu_data(); + Dtype* prefetch_label = batch->label_.mutable_cpu_data(); + + // datum scales + const int lines_size = lines_.size(); + for (int item_id = 0; item_id < batch_size; ++item_id) { + // get a blob + timer.Start(); + + Blob blob(1, 1, 1, width); + Dtype* data = blob.mutable_cpu_data(); + + ReadAudioFile(root_folder + lines_[lines_id_].first, data, width); + + read_time += timer.MicroSeconds(); + timer.Start(); + // Apply transformations to the audio + int offset = batch->data_.offset(item_id); + this->transformed_data_.set_cpu_data(prefetch_data + offset); + this->data_transformer_->Transform(&blob, &(this->transformed_data_)); + trans_time += timer.MicroSeconds(); + + prefetch_label[item_id] = lines_[lines_id_].second; + // go to the next iter + lines_id_++; + if (lines_id_ >= lines_size) { + // We have reached the end. Restart from the first. 
+ DLOG(INFO) << "Restarting data prefetching from start."; + lines_id_ = 0; + if (this->layer_param_.audio_data_param().shuffle()) { + ShuffleFiles(); + } + } + } + batch_timer.Stop(); + DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms."; + DLOG(INFO) << " Read time: " << read_time / 1000 << " ms."; + DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms."; +} + +INSTANTIATE_CLASS(AudioDataLayer); +REGISTER_LAYER_CLASS(AudioData); + +} // namespace caffe +#endif // USE_AUDIO diff --git a/src/caffe/layers/spectrogram_layer.cpp b/src/caffe/layers/spectrogram_layer.cpp new file mode 100644 index 00000000000..c473206e808 --- /dev/null +++ b/src/caffe/layers/spectrogram_layer.cpp @@ -0,0 +1,83 @@ +#ifdef USE_AUDIO +#include + +#include "caffe/common_layers.hpp" +#include "caffe/util/fft.hpp" + +namespace caffe { + +template +void SpectrogramLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + window_size_ = this->layer_param_.spectrogram_param().window_size(); + step_size_ = this->layer_param_.spectrogram_param().step_size(); +} + +template +void SpectrogramLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + + const Dtype* bottom_labels = bottom[1]->cpu_data(); + Dtype* top_labels = top[1]->mutable_cpu_data(); + + caffe_copy(bottom[1]->count(), bottom_labels, top_labels); + + int n = bottom[0]->num(); + int w = bottom[0]->width(); + + int top_width = window_size_ / 2; + int top_height = (w - window_size_ + step_size_) / step_size_; + + vector top_shape(0); + top_shape.push_back(n); + top_shape.push_back(1); + top_shape.push_back(top_height); + top_shape.push_back(top_width); + + top[0]->Reshape(top_shape); + + FastFourierTransform_cpu fft(window_size_); + + int bottom_offset = 0; + int top_offset = 0; + for (int i = 0; i < n; ++i) { + for (int j = 0; j < top_height; ++j) { + fft.process(const_cast(bottom_data + bottom_offset), + top_data + top_offset, window_size_); + bottom_offset = bottom[0]->offset(i, 0, 0, j * step_size_); + top_offset = top[0]->offset(i, 0, j, 0); + } + } +} + +template +void SpectrogramLayer::Reshape(const vector *> &bottom, + const vector *> &top) { + top[1]->ReshapeLike(*bottom[1]); + + int n = bottom[0]->num(); + int w = bottom[0]->width(); + + int top_width = window_size_ / 2; + int top_height = (w - window_size_ + step_size_) / step_size_; + + vector top_shape(4); + top_shape[0] = n; + top_shape[1] = 1; + top_shape[2] = top_height; + top_shape[3] = top_width; + + top[0]->Reshape(top_shape); +} + +#ifdef CPU_ONLY + STUB_GPU_FORWARD(SpectrogramLayer, Forward); +#endif + + INSTANTIATE_CLASS(SpectrogramLayer); + REGISTER_LAYER_CLASS(Spectrogram); + +} // namespace caffe +#endif // USE_AUDIO diff --git a/src/caffe/layers/spectrogram_layer.cu b/src/caffe/layers/spectrogram_layer.cu new file mode 100644 index 00000000000..c7c5200fc29 --- /dev/null +++ b/src/caffe/layers/spectrogram_layer.cu @@ -0,0 +1,48 @@ +#ifdef USE_AUDIO +#include + +#include "caffe/common_layers.hpp" +#include "caffe/util/fft.hpp" + +namespace caffe { + +template +void SpectrogramLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + + int n = bottom[0]->num(); + int c = bottom[0]->channels(); + int h = bottom[0]->height(); + int w = bottom[0]->width(); + + int top_width = window_size_ / 2; + int top_height = (w - window_size_ + 
step_size_) / step_size_; + + vector top_shape(); + top_shape.push_back(n); + top_shape.push_back(1); + top_shape.push_back(top_height); + top_shape.push_back(top_width); + + top[0]->Reshape(top_shape); + + FastFourierTransform_gpu fft(window_size_); + + int bottom_offset = 0; + int top_offset = 0; + for (int i = 0; i < n; ++i) { + for (int j = 0; j < top_height; ++j) { + fft.process(const_cast(bottom_data + bottom_offset), + top_data + top_offset, window_size_); + bottom_offset = bottom[0]->offset(i, 0, 0, j * step_size_); + top_offset = top[0]->offset(i, 0, j, 0); + } + } +} + +INSTANTIATE_LAYER_GPU_FORWARD(SpectrogramLayer); + +} // namespace caffe +#endif // USE_AUDIO diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index a8747c12b37..8038247388d 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -301,7 +301,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 139 (last added: tile_param) +// LayerParameter next available layer-specific ID: 141 (last added: spectrogram_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -350,6 +350,7 @@ message LayerParameter { // The default for the engine is set by the ENGINE switch at compile-time. optional AccuracyParameter accuracy_param = 102; optional ArgMaxParameter argmax_param = 103; + optional AudioDataParameter audio_data_param = 139; optional ConcatParameter concat_param = 104; optional ContrastiveLossParameter contrastive_loss_param = 105; optional ConvolutionParameter convolution_param = 106; @@ -379,6 +380,7 @@ message LayerParameter { optional ReshapeParameter reshape_param = 133; optional SigmoidParameter sigmoid_param = 124; optional SoftmaxParameter softmax_param = 125; + optional SpectrogramParameter spectrogram_param = 140; optional SPPParameter spp_param = 132; optional SliceParameter slice_param = 126; optional TanHParameter tanh_param = 127; @@ -450,6 +452,18 @@ message ArgMaxParameter { optional int32 axis = 3; } +message AudioDataParameter { + // Specify the data source. + required string source = 1; + // Specify the batch size. + optional uint32 batch_size = 2; + // Specify number of samples. + required uint32 width = 3; + // Whether or not AudioDataLayer should shuffle the list of files at every epoch. + optional bool shuffle = 4 [default = false]; + optional string root_folder = 5 [default = ""]; +} + message ConcatParameter { // The axis along which to concatenate -- may be negative to index from the // end (e.g., -1 for the last axis). 
Other axes must have the @@ -940,6 +954,11 @@ message SoftmaxParameter { optional int32 axis = 2 [default = 1]; } +message SpectrogramParameter { + required int32 window_size = 1; + required int32 step_size = 2; +} + message TanHParameter { enum Engine { DEFAULT = 0; diff --git a/src/caffe/util/fft.cpp b/src/caffe/util/fft.cpp new file mode 100644 index 00000000000..fe430ad67ba --- /dev/null +++ b/src/caffe/util/fft.cpp @@ -0,0 +1,130 @@ +#ifdef USE_AUDIO +#include "caffe/util/fft.hpp" + +#include + +#include +#include + +#include "caffe/common.hpp" +#include "caffe/util/math_functions.hpp" + + +namespace caffe { + +template +void hammingWindow(Dtype* data, int size); + +template +struct FastFourierTransformPImpl { + explicit FastFourierTransformPImpl(int size) : buffer(size + 2), + window(size) { + hammingWindow(&window[0], size); + } + std::valarray buffer; + std::valarray window; +}; + +template <> +FastFourierTransform_cpu::FastFourierTransform_cpu(int packetSize) +: _log2Size(std::ceil(std::log(packetSize) / std::log(2))), +_packetSize(static_cast(std::pow(2, _log2Size))), +_pimpl(new FastFourierTransformPImpl(packetSize)) +{} + +template <> +FastFourierTransform_cpu::FastFourierTransform_cpu(int packetSize) +: _log2Size(std::ceil(std::log(packetSize) / std::log(2))), +_packetSize(static_cast(std::pow(2, _log2Size))), +_pimpl(new FastFourierTransformPImpl(packetSize)) +{} + +template +FastFourierTransform_cpu::~FastFourierTransform_cpu() {} + +template <> +int FastFourierTransform_cpu::process(double* input_data, + double* output_data, int size) { + CHECK_LE(size, _packetSize); + + // Apply window to data + caffe_copy(size, input_data, &_pimpl->buffer[0]); + caffe_mul(size, &_pimpl->buffer[0], &_pimpl->window[0], + &_pimpl->buffer[0]); + + fftw_plan plan = fftw_plan_dft_r2c_1d(size, &_pimpl->buffer[0], + reinterpret_cast(&_pimpl->buffer[0]), + FFTW_ESTIMATE); + CHECK(plan) << "Could not create FFT plan."; + fftw_execute(plan); + fftw_destroy_plan(plan); + + // Normalize data + caffe_cvnrm(reinterpret_cast*>(&_pimpl->buffer[0]), + &_pimpl->buffer[0], size / 2); + caffe_scal(size, 1.0 / size, &_pimpl->buffer[0]); + + if (output_data) { + caffe_copy(size / 2, &_pimpl->buffer[0], output_data); + } else { + caffe_copy(size / 2, &_pimpl->buffer[0], input_data); + } + + return size; +} + +template <> +int FastFourierTransform_cpu::process(float* input_data, + float* output_data, int size) { + CHECK_LE(size, _packetSize); + + // Apply window to data + caffe_copy(size, input_data, &_pimpl->buffer[0]); + caffe_mul(size, &_pimpl->buffer[0], &_pimpl->window[0], + &_pimpl->buffer[0]); + + fftwf_plan plan = fftwf_plan_dft_r2c_1d(size, &_pimpl->buffer[0], + reinterpret_cast(&_pimpl->buffer[0]), + FFTW_ESTIMATE); + CHECK(plan) << "Could not create FFT plan."; + fftwf_execute(plan); + fftwf_destroy_plan(plan); + + // Normalize data + caffe_cvnrm(reinterpret_cast*>(&_pimpl->buffer[0]), + &_pimpl->buffer[0], size / 2); + caffe_scal(size, 1.0f / size, &_pimpl->buffer[0]); + + if (output_data) { + caffe_copy(size / 2, &_pimpl->buffer[0], output_data); + } else { + caffe_copy(size / 2, &_pimpl->buffer[0], input_data); + } + + return size; +} + +template <> +void hammingWindow(float* data, int size) { + const float alpha = 0.54; + const float beta = 0.46; + + for (int i = 0; i < size; ++i) { + data[i] = alpha - (beta * (2 * M_PI * i / (size - 1))); + } +} + +template <> +void hammingWindow(double* data, int size) { + const double alpha = 0.54; + const double beta = 0.46; + + for (int i = 0; i < size; ++i) { 
+ data[i] = alpha - (beta * (2 * M_PI * i / (size - 1))); + } +} + +INSTANTIATE_CLASS(FastFourierTransform_cpu); + +} // namespace caffe +#endif // USE_AUDIO diff --git a/src/caffe/util/fft.cu b/src/caffe/util/fft.cu new file mode 100644 index 00000000000..17a65bd3129 --- /dev/null +++ b/src/caffe/util/fft.cu @@ -0,0 +1,92 @@ +#ifdef USE_AUDIO +#include "caffe/util/fft.hpp" + +#include +#include + +#include "caffe/common.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +struct FastFourierTransformPImpl { + cufftHandle plan; +}; + +template <> +FastFourierTransform_gpu::FastFourierTransform_gpu(int packetSize) +: _log2Size(std::ceil(std::log(packetSize) / std::log(2))), +_packetSize(static_cast(std::pow(2, _log2Size))), +_pimpl(new FastFourierTransformPImpl()) +{} + +template <> +FastFourierTransform_gpu::FastFourierTransform_gpu(int packetSize) +: _log2Size(std::ceil(std::log(packetSize) / std::log(2))), +_packetSize(static_cast(std::pow(2, _log2Size))), +_pimpl(new FastFourierTransformPImpl()) +{} + +template +FastFourierTransform_gpu::~FastFourierTransform_gpu() {} + +template <> +int FastFourierTransform_gpu::process(double* input_data, + double* output_data, int size) { + CHECK_EQ(size, _packetSize); + + if (output_data) { + caffe_copy(size, input_data, output_data); + } else { + output_data = input_data; + } + + CHECK_EQ(cufftPlan1d(&(_pimpl->plan), size, CUFFT_D2Z, 1), CUFFT_SUCCESS) + << "Creation of plan failed."; + CHECK_EQ(cufftExecD2Z(_pimpl->plan, + reinterpret_cast(output_data), + reinterpret_cast(output_data)), + CUFFT_SUCCESS) << "Execution of cuFFT failed."; + CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess) + << "CUDA failed to synchronize."; + CHECK_EQ(cufftDestroy(_pimpl->plan), CUFFT_SUCCESS) + << "Failed to destroy cuFFT."; + caffe_cvnrm(reinterpret_cast*>(output_data), + output_data, size); + caffe_scal(size, 1.0 / size, output_data); + + return size; +} + +template <> +int FastFourierTransform_gpu::process(float* input_data, + float* output_data, int size) { + CHECK_EQ(size, _packetSize); + + if (output_data) { + caffe_copy(size, input_data, output_data); + } else { + output_data = input_data; + } + + CHECK_EQ(cufftPlan1d(&(_pimpl->plan), size, CUFFT_R2C, 1), CUFFT_SUCCESS) + << "Creation of plan failed."; + CHECK_EQ(cufftExecR2C(_pimpl->plan, reinterpret_cast(output_data), + reinterpret_cast(output_data)), + CUFFT_SUCCESS) << "Execution of cuFFT failed."; + CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess) + << "CUDA failed to synchronize."; + CHECK_EQ(cufftDestroy(_pimpl->plan), CUFFT_SUCCESS) + << "Failed to destroy cuFFT."; + caffe_cvnrm(reinterpret_cast*>(output_data), + output_data, size); + caffe_scal(size, 1.0f / size, output_data); + + return size; +} + +INSTANTIATE_CLASS(FastFourierTransform_gpu); + +} // namespace caffe +#endif // USE_AUDIO diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 0aab6b17b85..781fdc447f3 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include "caffe/common.hpp" @@ -68,6 +69,20 @@ template void caffe_set(const int N, const int alpha, int* Y); template void caffe_set(const int N, const float alpha, float* Y); template void caffe_set(const int N, const double alpha, double* Y); +template <> +void caffe_cvnrm(std::complex* input, double* output, int size) { + for (int i = 0; i < size; ++i) { + output[i] = std::abs(input[i]); + } +} + +template <> +void caffe_cvnrm(std::complex* 
input, float* output, int size) { + for (int i = 0; i < size; ++i) { + output[i] = std::abs(input[i]); + } +} + template <> void caffe_add_scalar(const int N, const float alpha, float* Y) { for (int i = 0; i < N; ++i) { diff --git a/src/caffe/util/read_audio.cpp b/src/caffe/util/read_audio.cpp new file mode 100644 index 00000000000..36457cdb96f --- /dev/null +++ b/src/caffe/util/read_audio.cpp @@ -0,0 +1,82 @@ +#ifdef USE_AUDIO +#include + +#include +#include + +#include "caffe/common.hpp" +#include "caffe/util/read_audio.hpp" + + +namespace caffe { + + int ReadAudioFile(const std::string& filePath, float* data, int capacity, + int offset) { + SF_INFO info = SF_INFO(); + + SNDFILE* file = sf_open(filePath.c_str(), SFM_READ, &info); + CHECK_EQ(sf_error(file), SF_ERR_NO_ERROR) << "Can't open file '" + << filePath << "': " << sf_strerror(file); + + sf_count_t status = sf_seek(file, offset, SEEK_SET); + CHECK_NE(status, -1) << "Can't seek to offset in: '" << filePath << + "': " << sf_strerror(file); + + sf_count_t numberOfFrames; + if (info.channels != 1) { + // Non-mono audio files will only have first channel read + std::valarray tempData(info.channels * capacity); + numberOfFrames = sf_read_float(file, &tempData[0], tempData.size()); + for (int i = 0; i < numberOfFrames / info.channels; ++i) { + data[i] = tempData[i * info.channels]; + } + } else { + numberOfFrames = sf_read_float(file, data, capacity); + } + + CHECK_EQ(numberOfFrames / info.channels, capacity) << + "File could not fill provided array"; + + status = sf_close(file); + CHECK_EQ(status, 0) << "Failed to close file: ''" << filePath << "': " + << sf_strerror(file); + + return numberOfFrames; + } + + int ReadAudioFile(const std::string& filePath, double* data, int capacity, + int offset) { + SF_INFO info = SF_INFO(); + + SNDFILE* file = sf_open(filePath.c_str(), SFM_READ, &info);\ + CHECK_EQ(sf_error(file), SF_ERR_NO_ERROR) << "Can't open file '" << + filePath << "': " << sf_strerror(file); + + sf_count_t status = sf_seek(file, offset, SEEK_SET); + CHECK_NE(status, -1) << "Can't seek to offset in: '" << filePath << + "': " << sf_strerror(file); + + sf_count_t numberOfFrames; + if (info.channels != 1) { + // Non-mono audio files will only have first channel read + std::valarray tempData(info.channels * capacity); + numberOfFrames = sf_read_double(file, &tempData[0], tempData.size()); + for (int i = 0; i < numberOfFrames / info.channels; ++i) { + data[i] = tempData[i * info.channels]; + } + } else { + numberOfFrames = sf_read_double(file, data, capacity); + } + + CHECK_EQ(numberOfFrames / info.channels, capacity) << + "File could not fill provided array"; + + status = sf_close(file); + CHECK_EQ(status, 0) << "Failed to close file: ''" << filePath << "': " + << sf_strerror(file); + + return numberOfFrames; + } + +} // namespace caffe +#endif // USE_AUDIO
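For context, a hedged usage sketch of the two new layers in a training prototxt. All names, paths and parameter values below are assumptions for illustration; the field names come from AudioDataParameter and SpectrogramParameter above, and both layers produce a data top and a label top:

layer {
  name: "audio"
  type: "AudioData"
  top: "data"
  top: "label"
  audio_data_param {
    source: "train_files.txt"   # one "<filename> <label>" pair per line
    root_folder: "/data/audio/"
    batch_size: 32
    width: 16384                # number of samples read per clip
    shuffle: true
  }
}
layer {
  name: "spec"
  type: "Spectrogram"
  bottom: "data"
  bottom: "label"
  top: "spectrogram"
  top: "spectrogram_label"
  spectrogram_param {
    window_size: 512
    step_size: 256
  }
}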