KWS Fix (#225)

MJ10 · web-flow · commit c7349dd67ede · 2021-01-07T14:51:34.000-08:00
* FastGRNNCUDA: batch_first fixes

* FastGRNNCUDA: docstring fix

* add note for cuda cell installation in README

* fix KWS training

* add kws demo

* fix typos
diff --git a/examples/pytorch/FastCells/KWS-training/README.md b/examples/pytorch/FastCells/KWS-training/README.md
@@ -65,9 +65,21 @@ python examples/pytorch/FastCells/train_classifier.py \
 	--lr_min 0.0005 --lr_scheduler CosineAnnealingLR --lr_peaks 0
 ```
 Drop the `--rolling` and `--max_rolling_length` options if you are going to run inference on 1 second clips,
-and do not plan to stream data through the model without resettting.
+and do not plan to stream data through the model without resetting. `$MODEL_DIR` should be set to the output path of the model. The training script will generate
+the following files in the output directory: `FastGRNN128KeywordSpotter.pt`, `FastGRNN128KeywordSpotter.onnx`, `mean.npy`, `std.npy` and a few other `.txt` files, along with a `config.json` in the current directory. Note: The names of the `.pt` and `.onnx` files may change based on the parameters passed for training.
+
+#### Run a demo with the model
+To evaluate the model on desktop, use the demo script in the directory. The demo script requires some additional dependencies:
+```
+pip install pyaudio python_speech_features
+```
+
+```bash
+python kws-demo.py --config_path <path_to_config.json> --model_path <path_to_model.pt> --mean_path <path_to_mean.npy> --std_path <path_to_std.npy>
+```
 
 ### Convert .onnx model to .ell IR
+Replace `model.onnx` with the name of the `.onnx` file generated after training in this as well as the following sections.
 ```
 pip install onnx #If you haven't already
 python $ELL_ROOT/tools/importers/onnx/onnx_import.py output_model/model.onnx
diff --git a/examples/pytorch/FastCells/KWS-training/kws-demo.py b/examples/pytorch/FastCells/KWS-training/kws-demo.py
@@ -0,0 +1,214 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+from threading import Thread
+from queue import Queue
+from sys import byteorder
+from array import array
+from struct import pack
+from collections import Counter
+import argparse
+
+import pyaudio
+import wave
+ 
+import numpy as np
+from edgeml_pytorch.graph.rnn import SRNN2
+from scipy.io import wavfile
+from python_speech_features import fbank
+import torch
+import time
+import os
+import pdb
+ 
+from training_config import TrainingConfig
+from train_classifier import create_model
+ 
+CLASS_LABELS = {  # predicted class index -> keyword printed by PredictionThread; index 0 has no entry and is never reported
+    1: 'backward', 
+    2: 'bed',
+    3: 'bird',
+    4: 'cat',
+    5: 'dog',
+    6: 'down',
+    7: 'eight',
+    8: 'five',
+    9: 'follow',
+    10: 'forward',
+    11: 'four',
+    12: 'go',
+    13: 'happy',
+    14: 'house',
+    15: 'learn',
+    16: 'left',
+    17: 'marvin',
+    18: 'nine',
+    19: 'no',
+    20: 'off',
+    21: 'on',
+    22: 'one',
+    23: 'right',
+    24: 'seven',
+    25: 'sheila',
+    26: 'six',
+    27: 'stop',
+    28: 'three',
+    29: 'tree',
+    30: 'two',
+    31: 'up',
+    32: 'visual',
+    33: 'wow',
+    34: 'yes',
+    35: 'zero'
+}
+
+# Audio recording parameters (used when opening the PyAudio stream and when writing .wav files)
+FORMAT = pyaudio.paInt16  # sample format handed to PyAudio
+RATE = 16000  # capture rate in Hz; also the frame rate written by save()
+ 
+# Feature extraction / window parameters (the loaded model is FastGRNN, see the __main__ block)
+maxlen = 16000  # samples per classification window (1 s at `samplerate`)
+num_filt = 32  # filterbank channels per frame; also passed to create_model
+samplerate = 16000  # Hz, used for featurization and for sizing `stride`
+winlen = 0.025  # analysis window length, multiplied by samplerate below to get samples
+save_file = False  # when True, PredictionThread dumps every window to a .wav via save()
+winstep = 0.010  # hop between analysis windows, multiplied by samplerate below to get samples
+
+winstepSamples = winstep * samplerate  # hop in samples
+winlenSamples = winlen * samplerate  # window length in samples
+numSteps = int(np.ceil((maxlen - winlenSamples)/winstepSamples) + 1)  # frames per window; only forwarded to extract_features
+
+# Streaming prediction parameters
+num_windows = 10  # number of most recent predictions kept for voting
+majority = 5  # NOTE(review): not referenced in this file; PredictionThread compares the vote count against a literal 7
+stride = int(50 * (samplerate / 1000))  # 50 ms expressed in samples
+CHUNK_SIZE = stride  # frames read from the microphone per queue item
+queue = Queue(10000000)  # bounded FIFO carrying chunks from RecordingThread to PredictionThread
+
+
+def extract_features(audio_data, data_len, num_filters,
+                        sample_rate, window_len, window_step):
+    """
+    Returns log filterbank energies (computed with `fbank`, not MFCCs) for every clip in `audio_data`; `data_len` is accepted but unused.
+    """
+    featurized_data = []
+    eps = 1e-10  # added before the log so zero energies do not produce -inf
+    for sample in audio_data:
+        # temp = [num_steps, num_filters]; the second value returned by fbank is discarded
+        temp, _ = fbank(sample, samplerate=sample_rate, winlen=window_len,
+                        winstep=window_step, nfilt=num_filters,
+                        winfunc=np.hamming)
+        temp = np.log(temp + eps)
+        featurized_data.append(temp)
+    return np.array(featurized_data)  # one feature matrix per input clip, stacked along axis 0
+ 
+class RecordingThread(Thread):
+    def run(self):
+        """Capture mono 16-bit audio in CHUNK_SIZE-frame chunks and put each chunk on the shared `queue`; loops forever."""
+        p = pyaudio.PyAudio()
+        stream = p.open(format=FORMAT, channels=1, rate=RATE, input=True, frames_per_buffer=CHUNK_SIZE)  # capture only: nothing is ever written to the stream
+        try:
+            while True:
+                snd_data = array('h', stream.read(CHUNK_SIZE))
+                if byteorder == 'big':
+                    snd_data.byteswap()
+                queue.put(snd_data)
+        finally:  # the loop only ends through an exception; release the audio device when it does
+            stream.stop_stream()
+            stream.close()
+            p.terminate()
+ 
+class PredictionThread(Thread):
+    def run(self):
+        """
+        Consume audio chunks from `queue`, classify a sliding window with `fastgrnn`,
+        and print a keyword when enough recent votes agree on a new non-zero class.
+
+        Reads the module globals `queue`, `mean`, `std` and `fastgrnn`; never returns.
+        """
+        r = array('h')
+        count = 0
+        prev_class = 0
+        fastgrnn_votes = []
+        while True:
+            data = queue.get()
+            queue.task_done()
+            count += 1
+            r.extend(data)
+            if count < 21:
+                # Keep buffering until enough chunks have arrived to fill the first window.
+                continue
+
+            r = r[stride:]  # slide the window forward by one chunk
+            if save_file:
+                data = pack('<' + ('h'*len(r)), *r)
+                # os.path.join avoids the invalid '\c' escape and the Windows-only separator.
+                save(data, 2, os.path.join('gen_sounds', 'cont'+str(count)+'.wav'))
+            data_np = np.array(r)
+            data_np = np.expand_dims(data_np, 0)
+            # Pass the module-level `num_filt`; no name `numFilt` exists in this file.
+            features = extract_features(data_np, numSteps, num_filt, samplerate, winlen, winstep)
+            features = (features - mean) / std
+            features = np.swapaxes(features, 0, 1)
+
+            logits = fastgrnn(torch.FloatTensor(features))
+            _, y = torch.max(logits, dim=1)
+            if len(fastgrnn_votes) == num_windows:
+                fastgrnn_votes.pop(0)  # keep only the latest num_windows votes
+            fastgrnn_votes.append(y.item())
+
+            if count % 10 == 0:
+                class_id, class_freq = Counter(fastgrnn_votes).most_common(1)[0]
+                if class_id != 0 and class_freq > 7 and prev_class != class_id:
+                    try:
+                        print('Keyword:', CLASS_LABELS[class_id])
+                    except KeyError:
+                        pass  # predicted index has no entry in CLASS_LABELS
+                prev_class = class_id
+ 
+def save(data, sample_width, path):
+    """
+    Saves audio `data` (raw frame bytes) to `path` as a single-channel WAV file
+    with `sample_width` bytes per sample and a frame rate of RATE.
+    """
+    with wave.open(path, 'wb') as wf:  # closes the file even if a setter or the write raises
+        wf.setnchannels(1)
+        wf.setsampwidth(sample_width)
+        wf.setframerate(RATE)
+        wf.writeframes(data)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser("Simple Keyword Spotting Demo")
+    parser.add_argument("--config_path", help="Path to config file", type=str, required=True)
+    parser.add_argument("--model_path", help="Path to trained model", type=str, required=True)
+    parser.add_argument("--mean_path", help="Path to train dataset mean", type=str, required=True)
+    parser.add_argument("--std_path", help="Path to train dataset std", type=str, required=True)
+
+    args = parser.parse_args()
+
+    # All four paths are required: each one is loaded unconditionally below.
+    config_path = args.config_path
+    fastgrnn_model_path = args.model_path
+    fastgrnn_mean_path = args.mean_path
+    fastgrnn_std_path = args.std_path
+
+    mean = np.load(fastgrnn_mean_path)  # read as module globals by PredictionThread
+    std = np.load(fastgrnn_std_path)
+
+    # Load FastGRNN weights onto the CPU
+    config = TrainingConfig()
+    config.load(config_path)
+    fastgrnn = create_model(config.model, num_filt, 35)
+    fastgrnn.load_state_dict(torch.load(fastgrnn_model_path, map_location=torch.device('cpu')))
+    fastgrnn.normalize(None, None)  # NOTE(review): presumably disables in-model normalization, since PredictionThread applies mean/std itself; confirm
+
+    # Start streaming prediction: the consumer starts first, then the microphone producer
+    pred = PredictionThread()
+    rec = RecordingThread()
+
+    pred.start()
+    rec.start()
+
+    pred.join()
+    rec.join()
diff --git a/pytorch/edgeml_pytorch/graph/rnn.py b/pytorch/edgeml_pytorch/graph/rnn.py
@@ -1509,7 +1509,7 @@ def forward(self, x, brickSize):
 class FastGRNNFunction(Function):
     @staticmethod
     def forward(ctx, input, bias_gate, bias_update, zeta, nu, old_h, w, u, w1, w2, u1, u2, gate_non_linearity):
-        outputs = fastgrnn_cuda.forward(input, w, u, bias_gate, bias_update, zeta, nu, old_h, gate_non_linearity, w1, w2, u1, u2)
+        outputs = fastgrnn_cuda.forward(input.contiguous(), w, u, bias_gate, bias_update, zeta, nu, old_h, gate_non_linearity, w1, w2, u1, u2)
         new_h = outputs[0]
         variables = [input, old_h, zeta, nu, w, u] + outputs[1:] + [w1, w2, u1, u2]
         ctx.save_for_backward(*variables)
@@ -1525,7 +1525,7 @@ def backward(ctx, grad_h):
 class FastGRNNUnrollFunction(Function):
     @staticmethod
     def forward(ctx, input, bias_gate, bias_update, zeta, nu, old_h, w, u, w1, w2, u1, u2, gate_non_linearity):
-        outputs = fastgrnn_cuda.forward_unroll(input, w, u, bias_gate, bias_update, zeta, nu, old_h, gate_non_linearity, w1, w2, u1, u2)
+        outputs = fastgrnn_cuda.forward_unroll(input.contiguous(), w, u, bias_gate, bias_update, zeta, nu, old_h, gate_non_linearity, w1, w2, u1, u2)
         hidden_states = outputs[0]
         variables = [input, hidden_states, zeta, nu, w, u] + outputs[1:] + [old_h, w1, w2, u1, u2]
         ctx.save_for_backward(*variables)
diff --git a/pytorch/edgeml_pytorch/trainer/fastmodel.py b/pytorch/edgeml_pytorch/trainer/fastmodel.py
@@ -177,6 +177,8 @@ def forward(self, input):
             else:
                 for l in range(self.num_layers):
                     rnn = self.rnn_list[l]
+                    if self.hidden_states[l] is not None:
+                        self.hidden_states[l] = self.hidden_states[l].clone().unsqueeze(0)
                     model_output = rnn(rnn_in, hiddenState=self.hidden_states[l])
                     self.hidden_states[l] = model_output.detach()[-1, :, :]
                     if self.tracking: