Merge pull request #30 from ReadAlongs/release_040

Final 0.4.0 release updates
ReadAlongs · Nov 9, 2022 · 9bbfdf3 · 9bbfdf3
2 parents 562024d + e8bd798
commit 9bbfdf3
Show file tree

Hide file tree

Showing 13 changed files with 103 additions and 115 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,9 +1,7 @@
 include CMakeLists.txt
 include LICENSE
-include README.manylinux.md
 include README.md
 include TODO.md
-include build_wheels.sh
 include config.h.in
 include requirements.dev.txt
 include docs/Makefile
@@ -17,10 +15,15 @@ include docs/source/readme.js.rst
 include docs/source/readme.rst
 include docs/source/soundswallower.rst
 include include/soundswallower/CMakeLists.txt
-include include/soundswallower/*.h
-include js/CMakeLists.txt
+recursive-include include *.h
 include js/README.md
+include js/.npmignore
+include js/*.txt
 include js/*.js
+include js/*.ts
+include js/*.html
+include js/*.py
+include js/*.c
 include js/*.json
 recursive-include model *
 include py/CMakeLists.txt
@@ -35,23 +38,27 @@ include py/test/test_fsg.py
 include pyproject.toml
 include setup.py
 include src/CMakeLists.txt
-include src/*.c
-include src/*.h
-include src/*.y
-include src/*.l
+recursive-include src *.c
+recursive-include src *.h
+recursive-include src *.y
+recursive-include src *.l
 include tests/CMakeLists.txt
 include tests/*.test
 include tests/*.res
 include tests/*.c
+include tests/*.sh
+include tests/testfuncs.sh.in
+include tests/test_macros.h.in
 include tests/compare_table.pl
 recursive-include tests/data *
-include tests/test_macros.h.in
 exclude MANIFEST.in
 exclude .readthedocs.yml
 exclude .travis.yml
 exclude .gitignore
 recursive-exclude .github *
 recursive-exclude _skbuild *
+recursive-exclude build *
+recursive-exclude jsbuild *
 recursive-exclude * .gitignore
 recursive-exclude * *.py[co]
 recursive-exclude * *~ 

diff --git a/README.manylinux.md b/README.manylinux.md
diff --git a/build_wheels.sh b/build_wheels.sh
diff --git a/js/README.md b/js/README.md
@@ -124,23 +124,29 @@ await decoder.initialize();
 
 The optional `loglevel` and `backtrace` options will make it a bit
 more verbose, so you can be sure it's actually doing something.  Now
-we will create the world's stupidest grammar, which recognizes one
-sentence:
+we will create and enable the world's stupidest grammar, which
+recognizes one sentence:
 
 ```js
-let fsg = decoder.create_fsg("goforward", 0, 4, [
+await decoder.set_fsg("goforward", 0, 4, [
     {from: 0, to: 1, prob: 1.0, word: "go"},
     {from: 1, to: 2, prob: 1.0, word: "forward"},
     {from: 2, to: 3, prob: 1.0, word: "ten"},
     {from: 3, to: 4, prob: 1.0, word: "meters"}
 ]);
-await decoder.set_fsg(fsg);
 ```
 
-You should `delete()` it, unless of course you intend to create a
-bunch of them and swap them in and out.  It is also possible to parse
-a grammar in [JSGF](https://en.wikipedia.org/wiki/JSGF) format, see
-below for an example.
+If you actually want to just recognize a single sentence, in order to
+get time alignments (this is known as "force-alignment"), we have a
+better method for you:
+
+```js
+await decoder.set_align_text("go forward ten meters");
+```
+
+It is also possible to parse a grammar in
+[JSGF](https://en.wikipedia.org/wiki/JSGF) format, see below for an
+example.
 
 Okay, let's wreck a nice beach!  Record yourself saying something,
 preferably the sentence "go forward ten meters", using SoX, for
@@ -171,6 +177,23 @@ console.log(decoder.get_hyp());
 console.log(decoder.get_hypseg());
 ```
 
+If you want even more detailed segmentation (phone and HMM state
+level) you can use `get_alignment_json`.  For more detail on this
+format, see [the PocketSphinx
+documentation](https://github.com/cmusphinx/pocketsphinx#usage) as it
+is borrowed from there.  Since this is JSON, you can create an object
+from it and iterate over it:
+
+```js
+const result = JSON.parse(await decoder.get_alignment_json());
+for (const word of result.w) {
+    console.log(`word ${word.t} at ${word.b} has duration ${word.d}`);
+    for (const phone of word.w) {
+        console.log(`phone ${phone.t} at ${phone.b} has duration ${phone.d}`);
+    }
+}
+```
+
 Finally, if your program is long-running and you think you might make
 multiple recognizers, you ought to delete them, because JavaScript is
 awful:
@@ -210,18 +233,6 @@ await require('soundswallower')(ssjs);
 This is simply concatenated to the model name, so you should make sure
 to include the trailing slash, e.g. "model/" and not "model"!
 
-Currently, it should also support any Sphinx format acoustic model, many of
-which are available for download at [the SourceForge
-page](https://sourceforge.net/projects/cmusphinx/files/Acoustic%20and%20Language%20Models/).
-
-To use a module, pass the directory (or base URL) containing its files
-(i.e. `means`, `variances`, etc) in the `hmm` property when
-initializing the decoder, for example:
-
-```js
-const decoder = ssjs.Decoder({hmm: "https://example.com/excellent-acoustic-model/"});
-```
-
 
 Using grammars
 --------------
@@ -231,7 +242,7 @@ from a JavaScript string and set it in the decoder like this (a
 hypothetical pizza-ordering grammar):
 
 ```js
-    let fsg = decoder.parse_jsgf(`#JSGF V1.0;
+    await decoder.set_jsgf(`#JSGF V1.0;
 grammar pizza;
 public <order> = [<greeting>] [<want>] [<quantity>] [<size>] [pizza] <toppings>;
 <greeting> = hi | hello | yo | howdy;
@@ -241,7 +252,6 @@ public <order> = [<greeting>] [<want>] [<quantity>] [<size>] [pizza] <toppings>;
 <toppings> = [with] <topping> ([and] <topping>)*;
 <topping> = olives | mushrooms | tomatoes | (green | hot) peppers | pineapple;
 `);
-    await decoder.set_fsg(fsg);
 ```
 
 Note that all the words in the grammar must first be defined in the
@@ -257,3 +267,32 @@ the internal state.
     await decoder.add_word("supercalifragilisticexpialidocious",
 	    "S UW P ER K AE L IH F R AE JH IH L IH S T IH K EH K S P IY AE L IH D OW SH Y UH S");
 ```
+
+Voice activity detection / Endpointing
+--------------------------------------
+
+This is a work in progress, but it is also possible to detect the
+start and end of speech in an input stream using an `Endpointer`
+object.  This requires you to pass buffers of a specific size, which
+is understandably difficult since WebAudio also only wants to *give*
+you buffers of a specific (and entirely different) size.  A better
+example is forthcoming but it looks a bit like this (copied directly
+from [the
+documentation](https://soundswallower.readthedocs.io/en/latest/soundswallower.js.html#Endpointer.get_in_speech)):
+
+```js
+let prev_in_speech = ep.get_in_speech();
+let frame_size = ep.get_frame_size();
+// Presume `frame` is a Float32Array of frame_size or less
+let speech;
+if (frame.length < frame_size)
+    speech = ep.end_stream(frame);
+else
+    speech = ep.process(frame);
+if (speech !== null) {
+    if (!prev_in_speech)
+        console.log("Speech started at " + ep.get_speech_start());
+    if (!ep.get_in_speech())
+        console.log("Speech ended at " + ep.get_speech_end());
+}
+```
diff --git a/js/package.json b/js/package.json
@@ -4,9 +4,9 @@
    "description": "An even smaller speech recognizer",
    "main": "soundswallower.js",
    "scripts": {
-       "test": "make && mocha test_node",
-       "tstest": "make && npx tsc && node test_typescript",
-       "webtest": "make && xdg-open http://localhost:8000/test_web.html && python server.py"
+       "test": "mocha test_node",
+       "tstest": "npx tsc && node test_typescript",
+       "webtest": "xdg-open http://localhost:8000/test_web.html && python server.py"
    },
    "repository": {
       "type": "git",

diff --git a/requirements.dev.txt b/requirements.dev.txt
@@ -1,5 +1,4 @@
-scikit-build~=0.13
-Cython~=0.29.21
-pytest~=7.1.2
-build~=0.8.0
+scikit-build
+Cython
+pytest
 numpy
diff --git a/tests/test_acmod.c b/tests/test_acmod.c
@@ -36,7 +36,7 @@ main(int argc, char *argv[])
     config_t *config;
     FILE *rawfh;
     int16 *buf;
-    int16 const *bptr;
+    int16 *bptr;
     mfcc_t **cepbuf, **cptr;
     size_t nread, nsamps;
     fe_t *fe;

diff --git a/tests/test_acmod_grow.c b/tests/test_acmod_grow.c
@@ -37,7 +37,7 @@ main(int argc, char *argv[])
     feat_t *fcb;
     FILE *rawfh;
     int16 *buf;
-    int16 const *bptr;
+    int16 *bptr;
     size_t nread, nsamps;
     int nfr;
     int frame_counter;

diff --git a/tests/test_fe.c b/tests/test_fe.c
@@ -125,10 +125,10 @@ create_shifted(fe_t *fe, int16 *data, size_t nsamp)
 }
 
 mfcc_t **
-create_full(fe_t *fe, const int16 *data, size_t nsamp)
+create_full(fe_t *fe, int16 *data, size_t nsamp)
 {
     mfcc_t **cepbuf;
-    const int16 *inptr;
+    int16 *inptr;
     int rv, nfr, ncep;
 
     TEST_EQUAL(0, fe_start(fe));
@@ -155,10 +155,10 @@ create_full(fe_t *fe, const int16 *data, size_t nsamp)
 }
 
 mfcc_t **
-create_process_frames(fe_t *fe, const int16 *data, size_t nsamp)
+create_process_frames(fe_t *fe, int16 *data, size_t nsamp)
 {
     mfcc_t **cepbuf;
-    const int16 *inptr;
+    int16 *inptr;
     int i, rv, nfr, ncep, frame_shift, frame_size;
 
     fe_get_input_size(fe, &frame_shift, &frame_size);
@@ -195,10 +195,10 @@ create_process_frames(fe_t *fe, const int16 *data, size_t nsamp)
 
 
 mfcc_t **
-create_fragments(fe_t *fe, const int16 *data, size_t nsamp)
+create_fragments(fe_t *fe, int16 *data, size_t nsamp)
 {
     mfcc_t **cepbuf, **cepptr;
-    const int16 *inptr;
+    int16 *inptr;
     int i, rv, nfr, ncep, frame_shift, frame_size;
     /* Should total 1024 :) */
     size_t fragments[] = {

diff --git a/tests/test_fe_float32.c b/tests/test_fe_float32.c
@@ -127,10 +127,10 @@ create_shifted(fe_t *fe, float32 *data, size_t nsamp)
 }
 
 mfcc_t **
-create_full(fe_t *fe, const float32 *data, size_t nsamp)
+create_full(fe_t *fe, float32 *data, size_t nsamp)
 {
     mfcc_t **cepbuf;
-    const float32 *inptr;
+    float32 *inptr;
     int rv, nfr, ncep;
 
     TEST_EQUAL(0, fe_start(fe));
@@ -157,10 +157,10 @@ create_full(fe_t *fe, const float32 *data, size_t nsamp)
 }
 
 mfcc_t **
-create_process_frames(fe_t *fe, const float32 *data, size_t nsamp)
+create_process_frames(fe_t *fe, float32 *data, size_t nsamp)
 {
     mfcc_t **cepbuf;
-    const float32 *inptr;
+    float32 *inptr;
     int i, rv, nfr, ncep, frame_shift, frame_size;
 
     fe_get_input_size(fe, &frame_shift, &frame_size);
@@ -197,10 +197,10 @@ create_process_frames(fe_t *fe, const float32 *data, size_t nsamp)
 
 
 mfcc_t **
-create_fragments(fe_t *fe, const float32 *data, size_t nsamp)
+create_fragments(fe_t *fe, float32 *data, size_t nsamp)
 {
     mfcc_t **cepbuf, **cepptr;
-    const float32 *inptr;
+    float32 *inptr;
     int i, rv, nfr, ncep, frame_shift, frame_size;
     /* Should total 1024 :) */
     size_t fragments[] = {
@@ -238,11 +238,11 @@ create_fragments(fe_t *fe, const float32 *data, size_t nsamp)
 
 
 mfcc_t **
-create_mixed_fragments(fe_t *fe, const float32 *data, const int16 *idata, size_t nsamp, int odd)
+create_mixed_fragments(fe_t *fe, float32 *data, int16 *idata, size_t nsamp, int odd)
 {
     mfcc_t **cepbuf, **cepptr;
-    const float32 *inptr;
-    const int16 *iinptr;
+    float32 *inptr;
+    int16 *iinptr;
     int i, rv, nfr, ncep, frame_shift, frame_size;
     /* Should total 1024 :) */
     size_t fragments[] = {

diff --git a/tests/test_feat_fe.c b/tests/test_feat_fe.c
@@ -54,7 +54,7 @@ main(int argc, char *argv[])
 	cptr = cepbuf;
 	nfr = total_frames;
 	while ((nsamp = fread(buf, sizeof(int16), 2048, raw)) > 0) {
-		int16 const *bptr = buf;
+		int16 *bptr = buf;
 		while (nsamp) {
 			int ncep = fe_process_int16(fe, &bptr, &nsamp,
                                                     cptr, nfr);