Skip to content

Commit

Permalink
Prepare for v1.0.1 (#24)
Browse files Browse the repository at this point in the history
* prepare for next release

* fixed style and removed unused file
  • Loading branch information
bdwyer2 authored Feb 14, 2019
1 parent f2f8b4d commit a090174
Show file tree
Hide file tree
Showing 8 changed files with 358 additions and 457 deletions.
5 changes: 3 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ services:
install:
- docker build -t max-audio-classifier .
- docker run -it -d -p 5000:5000 max-audio-classifier
- sleep 30
- pip install pytest requests flake8
before_script:
- pip install pytest requests
- flake8 . --max-line-length=127
- sleep 30
script:
- pytest tests/test.py
4 changes: 3 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ COPY requirements.txt /workspace
RUN pip install -r requirements.txt

COPY . /workspace
RUN md5sum -c md5sums.txt # check file integrity

# check file integrity
RUN md5sum -c md5sums.txt

EXPOSE 5000

Expand Down
4 changes: 2 additions & 2 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
# Application settings

# API metadata
API_TITLE = 'Model Asset Exchange Server'
API_DESC = 'An API for serving models'
API_TITLE = 'MAX Audio Classifier'
API_DESC = 'Identify sounds in short audio clips.'
API_VERSION = '0.1'

# default model
Expand Down
324 changes: 162 additions & 162 deletions core/mel_features.py

Large diffs are not rendered by default.

103 changes: 0 additions & 103 deletions core/vggish_inference_demo.py

This file was deleted.

83 changes: 42 additions & 41 deletions core/vggish_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,51 +23,52 @@
from . import vggish_params
import sys


def waveform_to_examples(data, sample_rate):
"""Converts audio waveform into an array of examples for VGGish.
"""Converts audio waveform into an array of examples for VGGish.
Args:
data: np.array of either one dimension (mono) or two dimensions
(multi-channel, with the outer dimension representing channels).
Each sample is generally expected to lie in the range [-1.0, +1.0],
although this is not required.
sample_rate: Sample rate of data.
Args:
data: np.array of either one dimension (mono) or two dimensions
(multi-channel, with the outer dimension representing channels).
Each sample is generally expected to lie in the range [-1.0, +1.0],
although this is not required.
sample_rate: Sample rate of data.
Returns:
3-D np.array of shape [num_examples, num_frames, num_bands] which represents
a sequence of examples, each of which contains a patch of log mel
spectrogram, covering num_frames frames of audio and num_bands mel frequency
bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
"""
# Convert to mono.
if len(data.shape) > 1:
data = np.mean(data, axis=1)
# Resample to the rate assumed by VGGish.
if sample_rate != vggish_params.SAMPLE_RATE:
data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
Returns:
3-D np.array of shape [num_examples, num_frames, num_bands] which represents
a sequence of examples, each of which contains a patch of log mel
spectrogram, covering num_frames frames of audio and num_bands mel frequency
bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
"""
# Convert to mono.
if len(data.shape) > 1:
data = np.mean(data, axis=1)
# Resample to the rate assumed by VGGish.
if sample_rate != vggish_params.SAMPLE_RATE:
data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

# Compute log mel spectrogram features.
log_mel = mel_features.log_mel_spectrogram(
data,
audio_sample_rate=vggish_params.SAMPLE_RATE,
log_offset=vggish_params.LOG_OFFSET,
window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
num_mel_bins=vggish_params.NUM_MEL_BINS,
lower_edge_hertz=vggish_params.MEL_MIN_HZ,
upper_edge_hertz=vggish_params.MEL_MAX_HZ)
# Compute log mel spectrogram features.
log_mel = mel_features.log_mel_spectrogram(
data,
audio_sample_rate=vggish_params.SAMPLE_RATE,
log_offset=vggish_params.LOG_OFFSET,
window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
num_mel_bins=vggish_params.NUM_MEL_BINS,
lower_edge_hertz=vggish_params.MEL_MIN_HZ,
upper_edge_hertz=vggish_params.MEL_MAX_HZ)

# Frame features into examples.
features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
example_window_length = int(round(
vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
example_hop_length = int(round(
vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
log_mel_examples = mel_features.frame(
log_mel,
window_length=example_window_length,
hop_length=example_hop_length)
return log_mel_examples
# Frame features into examples.
features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
example_window_length = int(round(
vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
example_hop_length = int(round(
vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
log_mel_examples = mel_features.frame(
log_mel,
window_length=example_window_length,
hop_length=example_hop_length)
return log_mel_examples


def wavfile_to_examples(wav_file):
Expand All @@ -82,7 +83,7 @@ def wavfile_to_examples(wav_file):
"""
try:
sr, wav_data = wavfile.read(wav_file)
except:
except IOError:
print("Error reading WAV file!")
print("The specified WAV file type is not supported by scipy.io.wavfile.read()")
sys.exit(1)
Expand Down
118 changes: 59 additions & 59 deletions core/vggish_postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,71 +21,71 @@


class Postprocessor(object):
"""Post-processes VGGish embeddings.
"""Post-processes VGGish embeddings.
The initial release of AudioSet included 128-D VGGish embeddings for each
segment of AudioSet. These released embeddings were produced by applying
a PCA transformation (technically, a whitening transform is included as well)
and 8-bit quantization to the raw embedding output from VGGish, in order to
stay compatible with the YouTube-8M project which provides visual embeddings
in the same format for a large set of YouTube videos. This class implements
the same PCA (with whitening) and quantization transformations.
"""
The initial release of AudioSet included 128-D VGGish embeddings for each
segment of AudioSet. These released embeddings were produced by applying
a PCA transformation (technically, a whitening transform is included as well)
and 8-bit quantization to the raw embedding output from VGGish, in order to
stay compatible with the YouTube-8M project which provides visual embeddings
in the same format for a large set of YouTube videos. This class implements
the same PCA (with whitening) and quantization transformations.
"""

def __init__(self, pca_params_npz_path):
"""Constructs a postprocessor.
def __init__(self, pca_params_npz_path):
"""Constructs a postprocessor.
Args:
pca_params_npz_path: Path to a NumPy-format .npz file that
contains the PCA parameters used in postprocessing.
"""
params = np.load(pca_params_npz_path)
self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
# Load means into a column vector for easier broadcasting later.
self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
assert self._pca_matrix.shape == (
vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
'Bad PCA means shape: %r' % (self._pca_means.shape,))
Args:
pca_params_npz_path: Path to a NumPy-format .npz file that
contains the PCA parameters used in postprocessing.
"""
params = np.load(pca_params_npz_path)
self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
# Load means into a column vector for easier broadcasting later.
self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
assert self._pca_matrix.shape == (
vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
'Bad PCA means shape: %r' % (self._pca_means.shape,))

def postprocess(self, embeddings_batch):
"""Applies postprocessing to a batch of embeddings.
def postprocess(self, embeddings_batch):
"""Applies postprocessing to a batch of embeddings.
Args:
embeddings_batch: An nparray of shape [batch_size, embedding_size]
containing output from the embedding layer of VGGish.
Args:
embeddings_batch: An nparray of shape [batch_size, embedding_size]
containing output from the embedding layer of VGGish.
Returns:
An nparray of the same shape as the input but of type uint8,
containing the PCA-transformed and quantized version of the input.
"""
assert len(embeddings_batch.shape) == 2, (
'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
'Bad batch shape: %r' % (embeddings_batch.shape,))
Returns:
An nparray of the same shape as the input but of type uint8,
containing the PCA-transformed and quantized version of the input.
"""
assert len(embeddings_batch.shape) == 2, (
'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
'Bad batch shape: %r' % (embeddings_batch.shape,))

# Apply PCA.
# - Embeddings come in as [batch_size, embedding_size].
# - Transpose to [embedding_size, batch_size].
# - Subtract pca_means column vector from each column.
# - Premultiply by PCA matrix of shape [output_dims, input_dims]
# where both are are equal to embedding_size in our case.
# - Transpose result back to [batch_size, embedding_size].
pca_applied = np.dot(self._pca_matrix,
(embeddings_batch.T - self._pca_means)).T
# Apply PCA.
# - Embeddings come in as [batch_size, embedding_size].
# - Transpose to [embedding_size, batch_size].
# - Subtract pca_means column vector from each column.
# - Premultiply by PCA matrix of shape [output_dims, input_dims]
# where both are are equal to embedding_size in our case.
# - Transpose result back to [batch_size, embedding_size].
pca_applied = np.dot(self._pca_matrix,
(embeddings_batch.T - self._pca_means)).T

# Quantize by:
# - clipping to [min, max] range
clipped_embeddings = np.clip(
pca_applied, vggish_params.QUANTIZE_MIN_VAL,
vggish_params.QUANTIZE_MAX_VAL)
# - convert to 8-bit in range [0.0, 255.0]
quantized_embeddings = (
(clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
(255.0 /
(vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
# - cast 8-bit float to uint8
quantized_embeddings = quantized_embeddings.astype(np.uint8)
# Quantize by:
# - clipping to [min, max] range
clipped_embeddings = np.clip(
pca_applied, vggish_params.QUANTIZE_MIN_VAL,
vggish_params.QUANTIZE_MAX_VAL)
# - convert to 8-bit in range [0.0, 255.0]
quantized_embeddings = (
(clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
(255.0 /
(vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
# - cast 8-bit float to uint8
quantized_embeddings = quantized_embeddings.astype(np.uint8)

return quantized_embeddings
return quantized_embeddings
Loading

0 comments on commit a090174

Please sign in to comment.