Conv1d and batchnorm1d layers #209
Open: AnirudhBHarish wants to merge 4 commits into microsoft:master from AnirudhBHarish:master
+2,270 −3
@@ -0,0 +1,104 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#ifndef __DSCNN_H__
#define __DSCNN_H__

// Function pointer for the Conv layer to be passed as a parameter. (conv1d or conv1d_lr only).
typedef int (*conv_layer)(float*, unsigned, unsigned, const float*,
                          unsigned, unsigned, unsigned, unsigned,
                          const void*, unsigned, unsigned);

/**
 * @brief Model definition for the 1D Convolution block applied before the RNN.
 * @brief sub-layers : batchnorm1d -> conv1d_lr.
 * @param[out] output_signal pointer to the final output signal, minimum size = out_time * in_channels. out_time has to be calculated based on the reduction from all the conv and pool layers.
 * @param[in] input_signal pointer to the input signal. size = in_time * in_channels.
 * @param[in] cnn function pointer for the CNN layer. (any of the conv layers can be passed with appropriate params).
 * @param[in] in_time number of time steps in the input_signal.
 * @param[in] in_channels number of input channels.
 * @param[in] mean pointer to the mean for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2.
 * @param[in] var pointer to the variance for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2.
 * @param[in] affine_config whether the affine operations are applied.
 *            if affine_config = 0, then only mean and var are used.
 *            if affine_config = 1, then mean, var, gamma and beta are used for the final computation.
 *            if affine_config = 2, then only gamma and beta are used. gamma = original_gamma/sqrt(var), beta = original_beta - original_gamma * mean/sqrt(var).
 *            Note: Use affine_config = 2 for faster calculations. The new gamma and beta need to be pre-computed, stored and passed.
 * @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0.
 * @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0.
 * @param[in] in_place in-place computation flag for the batchnorm (storage efficient).
 * @param[in] cnn_hidden hidden state/out_channels dimension for the low-rank CNN. The final channel size of this block.
 * @param[in] cnn_padding padding for the low-rank CNN layer. Note: applied to both sides of the input.
 * @param[in] cnn_kernel_size kernel size of the low-rank CNN.
 * @param[in] cnn_params weights, bias and other essential parameters for the low-rank CNN.
 * @param[in] cnn_stride stride factor for the low-rank CNN.
 * @param[in] cnn_activation an integer to choose the type of activation function.
 *            0: none.
 *            1: sigmoid.
 *            2: tanh.
 *            3: relu.
 */
int phon_pred_lr_cnn(float* output_signal, float* input_signal,
  conv_layer cnn, unsigned in_time, unsigned in_channels,
  const float* const mean, const float* const var,
  unsigned affine_config, const float* const gamma, const float* const beta, unsigned in_place,
  unsigned cnn_hidden, unsigned cnn_padding, unsigned cnn_kernel_size,
  const void* cnn_params, unsigned cnn_stride, unsigned cnn_activation);

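/* Illustration (not part of the original header): how the pre-computed gamma and beta
 * for affine_config = 2 could be derived from the trained batchnorm parameters, using the
 * folding formula quoted in the comment above. A minimal sketch; the helper name
 * fold_batchnorm_params and the eps argument are hypothetical, and <math.h> is needed for sqrtf. */
#include <math.h>  // for sqrtf; illustration only

static void fold_batchnorm_params(float* gamma, float* beta,
  const float* mean, const float* var,
  unsigned in_channels, float eps) {
  for (unsigned c = 0; c < in_channels; c++) {
    // new_gamma = gamma / sqrt(var); new_beta = beta - gamma * mean / sqrt(var)
    float inv_std = 1.0f / sqrtf(var[c] + eps);
    beta[c] -= gamma[c] * mean[c] * inv_std;  // uses the original gamma before it is overwritten
    gamma[c] *= inv_std;
  }
}
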
/**
 * @brief Model definition for the 1D Convolution block applied after the RNN.
 * @brief sub-layers : custom nonlinearity (semi_sigmoid_tanh) -> batchnorm1d -> conv1d_depth -> conv1d_lr -> avgpool1d.
 * @param[out] output_signal pointer to the final output signal, minimum size = out_time * in_channels. out_time has to be calculated based on the reduction from all the conv and pool layers.
 * @param[in] input_signal pointer to the input signal. size = in_time * in_channels.
 * @param[in] point_cnn function pointer for the point-wise CNN. (any of the conv layers can be passed with appropriate params).
 * @param[in] in_time number of time steps in the input.
 * @param[in] in_channels number of input channels.
 * @param[in] mean pointer to the mean for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2.
 * @param[in] var pointer to the variance for the batch normalization, size = in_channels. Pass NULL/0 for affine_config = 2.
 * @param[in] affine_config whether the affine operations are applied.
 *            if affine_config = 0, then only mean and var are used.
 *            if affine_config = 1, then mean, var, gamma and beta are used for the final computation.
 *            if affine_config = 2, then only gamma and beta are used. gamma = original_gamma/sqrt(var), beta = original_beta - original_gamma * mean/sqrt(var).
 *            Note: Use affine_config = 2 for faster calculations. The new gamma and beta need to be pre-computed, stored and passed.
 * @param[in] gamma pointer to the scaling factors for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0.
 * @param[in] beta pointer to the offsets for the post-norm affine operation, size = in_channels. Pass NULL/0 for affine_config = 0.
 * @param[in] in_place in-place computation flag for the batchnorm (storage efficient).
 * @param[in] depth_cnn_padding padding for the depth CNN layer. Note: applied to both sides of the input to the depth CNN.
 * @param[in] depth_cnn_kernel_size kernel size of the depth CNN.
 * @param[in] depth_cnn_params weights, bias and other essential parameters used to describe the depth CNN.
 * @param[in] depth_cnn_stride stride factor for the depth CNN.
 * @param[in] depth_cnn_activation an integer to choose the type of activation function.
 *            0: none.
 *            1: sigmoid.
 *            2: tanh.
 *            3: relu.
 * @param[in] point_cnn_hidden hidden state/out_channels dimension for the point CNN. The final channel size of this block.
 * @param[in] point_cnn_padding padding for the point CNN layer. Note: applied to both sides of the input to the point CNN.
 * @param[in] point_cnn_kernel_size kernel size of the point CNN.
 * @param[in] point_cnn_params weights, bias and other essential parameters used to describe the point CNN.
 * @param[in] point_cnn_stride stride factor for the point CNN.
 * @param[in] point_cnn_activation an integer to choose the type of activation function.
 *            0: none.
 *            1: sigmoid.
 *            2: tanh.
 *            3: relu.
 * @param[in] pool_padding padding for the pool layer. Note: applied to both sides of the input to the pool.
 * @param[in] pool_kernel_size kernel size of the pool.
 * @param[in] pool_stride stride factor for the pool.
 * @param[in] pool_activation an integer to choose the type of activation function.
 *            0: none.
 *            1: sigmoid.
 *            2: tanh.
 *            3: relu.
 */
int phon_pred_depth_point_lr_cnn(float* output_signal, float* input_signal,
  conv_layer point_cnn, unsigned in_time, unsigned in_channels,
  const float* const mean, const float* const var,
  unsigned affine_config, const float* const gamma, const float* const beta, unsigned in_place,
  unsigned depth_cnn_padding, unsigned depth_cnn_kernel_size,
  const void* depth_cnn_params, unsigned depth_cnn_stride, unsigned depth_cnn_activation,
  unsigned point_cnn_hidden, unsigned point_cnn_padding, unsigned point_cnn_kernel_size,
  const void* point_cnn_params, unsigned point_cnn_stride, unsigned point_cnn_activation,
  unsigned pool_padding, unsigned pool_kernel_size, unsigned pool_stride, unsigned pool_activation);

#endif
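Both block functions leave out_time to the caller: the doc comments only say it must account for the reduction from every conv and pool layer. Below is a minimal sketch of that bookkeeping, assuming each stage follows the usual 1D output-length rule, out = (in + 2*padding - kernel_size)/stride + 1; that rule and the helper names conv_out_time/dscnn_out_time are assumptions and should be checked against the library's conv and pool implementations.

// Hypothetical helper: output length of one conv/pool stage under the standard 1D rule.
static unsigned conv_out_time(unsigned in_time, unsigned padding,
                              unsigned kernel_size, unsigned stride) {
  return (in_time + 2 * padding - kernel_size) / stride + 1;
}

// Hypothetical helper: chain the depth -> point -> avgpool stages of the post-RNN block.
static unsigned dscnn_out_time(unsigned in_time,
  unsigned depth_padding, unsigned depth_kernel, unsigned depth_stride,
  unsigned point_padding, unsigned point_kernel, unsigned point_stride,
  unsigned pool_padding, unsigned pool_kernel, unsigned pool_stride) {
  unsigned t = in_time;
  t = conv_out_time(t, depth_padding, depth_kernel, depth_stride);  // conv1d_depth
  t = conv_out_time(t, point_padding, point_kernel, point_stride);  // conv1d_lr (point-wise)
  t = conv_out_time(t, pool_padding, pool_kernel, pool_stride);     // avgpool1d
  return t;  // size output_signal for t time steps times the block's output channel count
}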
@@ -0,0 +1,105 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.

#ifndef __RNN_BRICKED_H__
#define __RNN_BRICKED_H__

/* All the matrices are stored in the row-major format.

   NOTES for using the layers.

   -> Single-directional computation.
      While using the bricked fastgrnn layers, the user needs to adhere to the following two constraints:
      1) in_time % hop = 0.
      2) fwd_window % hop = 0 and bwd_window % hop = 0.
      Violating either constraint (1 or 2) will cause segmentation faults.
      The layers first compute all the Wx steps and then compute Uh for all the windows in parallel;
      hence constraints 1 & 2 must hold.

   -> Bi-directional computation.
      For bi-directional cases, two additional constraints need to be followed:
      A) sample_first_brick and sample_last_brick = 1.
      B) An offset of rnn_hidden needs to be added to the output_signal pointer during the backward function call.
      Each function only processes its own context (forward/backward); the other context needs to be called separately.
      E.g.: 1st step -> forward(output, ..., input, ..., bi-direction=1, ...).
            2nd step -> backward(output + rnn_hidden, ..., input, ..., bi-direction=1, ...).
      (A usage sketch of this call pattern follows the function declarations below.)
      The two extra constraints (A & B) apply only to bi-directional cases and can be ignored if only forward (or only backward) is used.
      Violating them causes index mismatches or data corruption:
      if the first (last) brick is not sampled, the first few (last few) time steps will be missing in the forward (backward) result;
      if the offset is not passed during the backward function call, the backward pass will overwrite the forward result (bi-directional case only).
*/

/**
 * @brief Model parameters for the low-rank bricked FastGRNN layer.
 * @var W1 pointer to the first low-rank component of W. shape = [rank * in_dims].
 * @var W2 pointer to the second low-rank component of W. shape = [rnn_hidden * rank].
 * @var wRank rank of the W matrix.
 * @var U1 pointer to the first low-rank component of U. shape = [rank * rnn_hidden].
 * @var U2 pointer to the second low-rank component of U. shape = [rnn_hidden * rank].
 * @var uRank rank of the U matrix.
 * @var Bg pointer to the bias for the sigmoid.
 * @var Bh pointer to the bias for the tanh.
 * @var sigmoid_zeta first weight parameter for the update from the input at the next step.
 * @var sigmoid_nu second weight parameter for the update from the input at the next step.
 * @var block_size_w_to_lr block/tile size for the cache. Used for the tiled MatMul W1 * x.
 * @var block_size_w_from_lr block/tile size for the cache. Used for the tiled MatMul W2 * (W1 * x).
 * @var block_size_u_to_lr block/tile size for the cache. Used for the tiled MatMul U1 * h.
 * @var block_size_u_from_lr block/tile size for the cache. Used for the tiled MatMul U2 * (U1 * h).
 */
typedef struct BrickedFastGRNN_LR_Params {
  float* W1;
  float* W2;
  unsigned wRank;
  float* U1;
  float* U2;
  unsigned uRank;
  float* Bg;
  float* Bh;
  float sigmoid_zeta;
  float sigmoid_nu;
  unsigned block_size_w_to_lr;
  unsigned block_size_w_from_lr;
  unsigned block_size_u_to_lr;
  unsigned block_size_u_from_lr;
} BrickedFastGRNN_LR_Params;

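/* Illustration (not part of the original header): the low-rank factorization implied by the
 * shapes above. W (rnn_hidden x in_dims) is stored as W2 (rnn_hidden x wRank) times
 * W1 (wRank x in_dims), so Wx is evaluated as W2 * (W1 * x); the same pattern applies to U.
 * The sketch below is a plain, untiled version of that two-step product, whereas the library's
 * kernels presumably use the block_size_* fields for cache-tiled MatMuls. The helper name
 * lowrank_matvec and the caller-provided scratch buffer are hypothetical. */
static void lowrank_matvec(float* out, const float* W1, const float* W2,
  const float* x, float* scratch,  // scratch must hold `rank` floats
  unsigned out_dims, unsigned rank, unsigned in_dims) {
  // scratch = W1 * x, with W1 stored row-major as [rank x in_dims]
  for (unsigned r = 0; r < rank; r++) {
    float acc = 0.0f;
    for (unsigned c = 0; c < in_dims; c++)
      acc += W1[r * in_dims + c] * x[c];
    scratch[r] = acc;
  }
  // out = W2 * scratch, with W2 stored row-major as [out_dims x rank]
  for (unsigned o = 0; o < out_dims; o++) {
    float acc = 0.0f;
    for (unsigned r = 0; r < rank; r++)
      acc += W2[o * rank + r] * scratch[r];
    out[o] = acc;
  }
}
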
/** Forward bricking and application of the forward RNN for an input signal.
 * @param[out] output_signal pointer to the output signal. size = out_time * rnn_hidden.
 * @param[in] rnn_hidden output dimension for the current cell.
 * @param[in] input_signal pointer to the input signal. size = in_time * in_dims.
 * @param[in] in_time number of input time steps.
 * @param[in] in_dims input dimensions.
 * @param[in] window window length for each brick. For the final brick, the leftover time steps are used (the last brick need not be window in length).
 * @param[in] hop hop distance between bricks.
 * @param[in] params pointer to the parameters for the RNN.
 * @param[in] bi_direction determines if the output is for a bi-directional RNN.
 * @param[in] sample_first_brick determines if the 1st brick should also be sampled.
 *            -> if = 0, only the last hidden state of each brick is sampled. out_time = (in_time-window)/hop + 1.
 *            -> if = 1, for the 1st brick, we sample at every hop index (similar to ::hop). For all the bricks (including the 1st) we sample the final hidden state. out_time = in_time/hop + 1.
 */
int forward_bricked_fastgrnn_lr(float* output_signal, unsigned rnn_hidden,
  float* input_signal, unsigned in_time, unsigned in_dims,
  unsigned window, unsigned hop, const void* params,
  unsigned bi_direction, unsigned sample_first_brick);

/** Backward bricking and application of the backward RNN for an input signal.
 * @param[out] output_signal pointer to the output signal. size = out_time * rnn_hidden.
 * @param[in] rnn_hidden output dimension for the current cell.
 * @param[in] input_signal pointer to the input signal. size = in_time * in_dims.
 * @param[in] in_time number of input time steps.
 * @param[in] in_dims input dimensions.
 * @param[in] window window length for each brick. For the final brick, the leftover time steps are used (the last brick need not be window in length).
 * @param[in] hop hop distance between bricks.
 * @param[in] params pointer to the parameters for the RNN.
 * @param[in] bi_direction determines if the output is for a bi-directional RNN.
 * @param[in] sample_last_brick determines if the last brick should also be sampled.
 *            -> if = 0, only the first (last in reverse) hidden state of each brick is sampled. out_time = (in_time-window)/hop + 1.
 *            -> if = 1, for the last brick, we sample at every hop index in reverse (similar to ::hop in reverse). For all the bricks (including the last) we sample the first hidden state (last in reverse). out_time = in_time/hop + 1.
 */
int backward_bricked_fastgrnn_lr(float* output_signal, unsigned rnn_hidden,
  float* input_signal, unsigned in_time, unsigned in_dims,
  unsigned window, unsigned hop, const void* params,
  unsigned bi_direction, unsigned sample_last_brick);

#endif
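A minimal usage sketch of the bi-directional call pattern described in the notes above, assuming this header is included: the forward and backward passes are invoked separately and the backward output is written at an offset of rnn_hidden. The driver name run_bidirectional, the calloc-based buffer, and the assumption that the bi-directional output interleaves forward and backward states per time step (hence the 2 * rnn_hidden allocation) are all illustrative, not taken from the PR; out_time follows the sample_first_brick/sample_last_brick = 1 formula quoted in the comments.

#include <stdlib.h>

// Hypothetical driver: run a bi-directional bricked FastGRNN over one input buffer.
// Constraints from the notes: in_time % hop == 0, window % hop == 0,
// sample_first_brick = sample_last_brick = 1, backward output offset by rnn_hidden.
float* run_bidirectional(float* input_signal, unsigned in_time, unsigned in_dims,
  unsigned window, unsigned hop, unsigned rnn_hidden,
  const BrickedFastGRNN_LR_Params* fwd_params,
  const BrickedFastGRNN_LR_Params* bwd_params) {
  unsigned out_time = in_time / hop + 1;  // sample_first/last_brick = 1 case
  float* output = (float*) calloc((size_t) out_time * 2 * rnn_hidden, sizeof(float));
  if (output == NULL) return NULL;
  // 1st step: forward context fills the first rnn_hidden entries of each time step.
  forward_bricked_fastgrnn_lr(output, rnn_hidden,
    input_signal, in_time, in_dims,
    window, hop, fwd_params, 1, 1);
  // 2nd step: backward context writes at an offset of rnn_hidden (constraint B).
  backward_bricked_fastgrnn_lr(output + rnn_hidden, rnn_hidden,
    input_signal, in_time, in_dims,
    window, hop, bwd_params, 1, 1);
  return output;  // caller frees
}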