From fb1bd0b16917113274a5387e219175c330e344f8 Mon Sep 17 00:00:00 2001
From: Sasha Rush
Date: Fri, 16 Jul 2021 19:51:31 -0400
Subject: [PATCH 1/5] MFCC speech processing demo

---
 examples/alignment.dx | 381 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 381 insertions(+)
 create mode 100644 examples/alignment.dx

diff --git a/examples/alignment.dx b/examples/alignment.dx
new file mode 100644
index 000000000..308f92937
--- /dev/null
+++ b/examples/alignment.dx
@@ -0,0 +1,381 @@
+import fft
+import parser
+import plot
+
+' # Speech Processing
+
+' This notebook describes the basic steps behind speech processing.
+  The goal will be to walk through the process from reading in a wave
+  file to being ready to train a full speech recognition system.
+
+
+' ## Wave Files
+  To begin we need to be able to read in .wav files. To do this
+  we follow the wave file spec and define a header and the raw
+  data format.
+
+' [Wave Format Documentation](https://docs.fileformat.com/audio/wav/)
+
+
+data WavHeader =
+  AsWavHeader {riff:(Fin 4=>Char) &
+               size:Int &
+               type:(Fin 4=>Char) &
+               chunk:(Fin 4=>Char) &
+               formatlength:Int &
+               channels:Int &
+               samplerate:Int &
+               bitspersample:Int &
+               datasize:Int}
+
+data Wav = AsWav header:WavHeader dat:(List Int)
+
+' ### Binary Manipulation
+  We will need a couple of additional binary conversion functions. (These are naive for now.)
+
+z = FToI (pow 2.0 15.0)  -- 2^15, the smallest 16-bit value that has the sign bit set
+def W32ToI (x : Word32): Int =
+  y:Int = internalCast _ x
+  select (y < z) y (y - 2 * z)  -- 16-bit two's complement: 2^15 .. 2^16-1 map to y - 2^16
+
+def W8ToW32 (x : Word8): Word32 = internalCast _ x
+
+def bytesToInt32 (chunk : Fin 4 => Byte) : Int =
+  [a, b, c, d] = map W8ToW32 chunk
+  internalCast _ ((d .<<. 24) .|. (c .<<. 16) .|. (b .<<. 8) .|. a)  -- little-endian; no 16-bit wrap on 32-bit fields
+
+
+def bytesToInt16 (chunk : Fin 2 => Byte) : Int =
+  [a, b] = map W8ToW32 chunk
+  W32ToI ((b .<<. 8) .|. a)  -- little-endian: second byte is the high byte
+
+
+' ### Parsing Types
+
+' We also add a couple of additional parser helpers.
+
+def parseGroup (parser: Parser a) : (Parser (m => a)) = MkParser \h. for i. parse h parser  -- one `parser` run per index of m
+parse4: Parser (Fin 4=>Char) = parseGroup parseAny
+parse2: Parser (Fin 2=>Char) = parseGroup parseAny
+
+def parse16 : Parser Int = MkParser \h.
+  bytesToInt16 $ parse h parse2
+
+def pString (x:String) : Parser Unit = MkParser \h.
+  (AsList n ls) = x
+  iter \i .
+    case i < n of
+      True ->
+        _ = parse h (pChar ls.(i@_))
+        Continue
+      False -> Done ()
+
+
+' ### Wave Parsing and IO
+
+' We define a parser to construct the header and to read in the data.
+
+wavparser : Parser Wav = MkParser \h.
+  riff = parse h parse4
+  size = bytesToInt32 $ parse h parse4
+  type = parse h parse4
+  chunk = parse h parse4
+  formatlength = bytesToInt32 $ parse h parse4  -- the 4-byte format length directly follows the chunk id
+  _ = parse h parse16  -- skip the 2-byte format type that comes after the length
+  channels = bytesToInt16 $ parse h parse2
+  samplerate = bytesToInt32 $ parse h parse4
+  _ = bytesToInt32 $ parse h parse4  -- skipped 4-byte field (byte rate in the WAV layout)
+  _ = bytesToInt16 $ parse h parse2  -- skipped 2-byte field (block align in the WAV layout)
+  bitspersample = bytesToInt16 $ parse h parse2
+  _ = parse h $ pString (AsList 4 ['d', 'a', 't', 'a'])
+  datasize = bytesToInt32 $ parse h parse4
+  x = parse h $ parseMany parse16  -- every remaining 16-bit sample
+  header = AsWavHeader {riff=riff, size=size,
+                        type=type, chunk=chunk,
+                        formatlength=formatlength,
+                        channels=channels, samplerate=samplerate,
+                        bitspersample=bitspersample,
+                        datasize=datasize}
+  AsWav header x
+
+
+' We add a helper function to read in the whole file.
+
+def fullstream (stream : Stream ReadMode) : {IO} String =
+  yieldState (AsList _ []) \results.
+    iter \_.
+      str = fread stream
+      (AsList n _) = str
+      results := (get results) <> str
+      case n == 0 of  -- stop once a read returns no characters
+        True -> Done ()
+        False -> Continue
+
+x = unsafeIO do runParserPartial (withFile "examples/speech2.wav" ReadMode fullstream) wavparser
+(AsList len raw_sample_wav) = case x of
+  Nothing -> mempty
+  (Just (AsWav header dat)) -> dat
+
+Datsize = Fin len
+samplerate = 8000.0  -- NOTE(review): hard-coded; the parsed header's samplerate is discarded above
+
+
+' ### The Audio
+
+
+' We are going to work with an MNIST-style set of audio clips: short snippets of
+  people saying letters.
+ +s = unsafeIO do base64Encode (withFile "examples/speech2.wav" ReadMode fullstream) + +:html "