From 28d3ab406d29a444efbb64cee9b926bec4f744fd Mon Sep 17 00:00:00 2001
From: caleb crome <caleb@crome.org>
Date: Wed, 22 Dec 2021 09:45:49 -0800
Subject: [PATCH] Added multi-pin TDM support.

This allows you use use more than one TX or RX pin in TDM mode,
allowing for up to 64 channels of record or playback data

This fix goes hand-in-hand with the multi-channel USB support.
https://github.com/PaulStoffregen/cores/pull/732

input_tdm: every odd channel had every other sample swapped
In every odd channel in TDM input (1, 3, 5, 7, 9, 11, 13, 15), every
other word was swapped due to an incorrect copy from 32-bits to
16-bits. This fix corrects the odd channels.   The Shift-by zeros  and the
extraneous logical ands are there for clarity, and I verified they
don't end up affecting final code optimization as long as optimization
is turned on.
---
 input_tdm.cpp  | 109 +++++++++++++++++++++++++++++++++----------------
 input_tdm.h    |   5 ++-
 input_tdm2.cpp |   4 +-
 output_tdm.cpp |  17 ++++++--
 output_tdm.h   |   1 +
 5 files changed, 94 insertions(+), 42 deletions(-)

diff --git a/input_tdm.cpp b/input_tdm.cpp
index 77c572724..e693fdd67 100644
--- a/input_tdm.cpp
+++ b/input_tdm.cpp
@@ -31,17 +31,15 @@
 #include "utility/imxrt_hw.h"
 
 DMAMEM __attribute__((aligned(32)))
-static uint32_t tdm_rx_buffer[AUDIO_BLOCK_SAMPLES*16];
-audio_block_t * AudioInputTDM::block_incoming[16] = {
-	nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
-	nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr
-};
+static uint32_t tdm_rx_buffer[AUDIO_BLOCK_SAMPLES*TDM_IN_N_AUDIO_BLOCKS];
+audio_block_t * AudioInputTDM::block_incoming[TDM_IN_N_AUDIO_BLOCKS];
 bool AudioInputTDM::update_responsibility = false;
 DMAChannel AudioInputTDM::dma(false);
 
-
 void AudioInputTDM::begin(void)
 {
+	for (int i = 0; i < TDM_IN_N_AUDIO_BLOCKS; i++)
+		block_incoming[i] = nullptr;
 	dma.begin(true); // Allocate the DMA channel first
 
 	// TODO: should we set & clear the I2S_RCSR_SR bit here?
@@ -68,7 +66,22 @@ void AudioInputTDM::begin(void)
 	dma.attachInterrupt(isr);
 #elif defined(__IMXRT1062__)
 	CORE_PIN8_CONFIG  = 3;  //RX_DATA0
-	IOMUXC_SAI1_RX_DATA0_SELECT_INPUT = 2;
+	IOMUXC_SAI1_RX_DATA0_SELECT_INPUT = 2; // TEENSY P8, GPIO_B1_00_ALT3  as rx 0
+        // When using combine mode, you *must* use the data pins in order.
+	switch (AUDIO_N_SAI1_RX_DATAPINS) {
+        case 4: 
+            CORE_PIN32_CONFIG  = 3;  //RX_DATA3
+            IOMUXC_SAI1_RX_DATA3_SELECT_INPUT = 1; // TEENSY P32, GPIO_B0_12_ALT3 as rx 1
+        case 3:
+            CORE_PIN9_CONFIG  = 3;  //RX_DATA2
+            IOMUXC_SAI1_RX_DATA2_SELECT_INPUT = 1; // TEENSY P9, GPIO_B0_11_ALT3 as rx 2
+        case 2:
+            CORE_PIN6_CONFIG  = 3;  //RX_DATA1
+            IOMUXC_SAI1_RX_DATA1_SELECT_INPUT = 1; // TEENSY P6, GPIO_B0_10_ALT3 as rx 1
+            break;
+        default:
+            break;
+	}
 	dma.TCD->SADDR = &I2S1_RDR0;
 	dma.TCD->SOFF = 0;
 	dma.TCD->ATTR = DMA_TCD_ATTR_SSIZE(2) | DMA_TCD_ATTR_DSIZE(2);
@@ -89,25 +102,57 @@ void AudioInputTDM::begin(void)
 #endif	
 }
 
-// TODO: needs optimization...
-static void memcpy_tdm_rx(uint32_t *dest1, uint32_t *dest2, const uint32_t *src)
-{
-	uint32_t i, in1, in2;
-
-	for (i=0; i < AUDIO_BLOCK_SAMPLES/2; i++) {
-		in1 = *src;
-		in2 = *(src+8);
-		src += 16;
-		*dest1++ = (in1 >> 16) | (in2 & 0xFFFF0000);
-		*dest2++ = (in1 << 16) | (in2 & 0x0000FFFF);
-	}
+// Since we're grabbing data out of the fifo at 32-bits, but the words
+// are 16-bits and we're sharing fifos in a round-robbin manner, with
+//
+// 1 FIFO (pin) active, the word/channel order for the first frame is
+// C00 C02 C04 C06 C08 C10 C12 C14
+// C01 C03 C05 C07 C09 C11 C13 C15
+//
+// 2 FIFO (pin) active, the word/channel order for the first frame is
+// C00 C16 C02 C18 C04 C20 C06 C22 C08 C24 C10 C26 C12 C28 C14 C30 
+// C01 C17 C03 C19 C05 C21 C07 C23 C09 C25 C11 C27 C13 C29 C15 C31 
+//
+// 3 FIFO (pin) active, the word/channel order for the first frame is
+// C00 C16 C32 C02 C18 C34 C04 C20 C36 C06 C22 C38 C08 C24 C40 C10 C26 C42 C12 C28 C44 C14 C30 C46
+// C01 C17 C33 C03 C19 C35 C05 C21 C37 C07 C23 C39 C09 C25 C41 C11 C27 C43 C13 C29 C45 C15 C31 C47
+//
+// 4 FIFO (pin) active, the word/channel order for the first frame is
+// C00 C16 C32 C48|C02 C18 C34 C50|C04 C20 C36 C52|C06 C22 C38 C54|C08 C24 C40 C56|C10 C26 C42 C58|C12 C28 C44 C60|C14 C30 C46 C62
+// C01 C17 C33 C49|C03 C19 C35 C51|C05 C21 C37 C53|C07 C23 C39 C55|C09 C25 C41 C57|C11 C27 C43 C59|C13 C29 C45 C61|C15 C31 C47 C63 
+//  0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31 <--- word index
+// where each column is a single 32-bit word popped from the fifo
+static void deinterleave(audio_block_t *block_incoming[TDM_IN_N_AUDIO_BLOCKS], const uint32_t *src) {
+    int l = 0;
+    for (int i=0; i < TDM_IN_N_AUDIO_BLOCKS/2; i++) {
+        // ix = 0,  2,  4,  6,          ... 1 pin
+        // ix = 0, 16,  2, 18,  3, 20, ... 2 pins
+        // ix = 0, 16, 32,  2, 18, 33 ... 3 pins
+        // ix = 0, 16, 32, 48,  2, 18 ... 4 pins
+
+        //  i = 0,  1,  2,  3,  4,  5,
+        // ix = 0, 16, 32,  2, 18, 33 ... 3 pins
+        //    = (0*16 1*16 2*16)+i
+        int ix = (l*16) + i/AUDIO_N_SAI1_RX_DATAPINS*2;
+        l = (l + 1) % AUDIO_N_SAI1_RX_DATAPINS;
+        uint32_t *dest1 = (uint32_t *)(block_incoming[ix]->data);
+        uint32_t *dest2 = (uint32_t *)(block_incoming[ix+1]->data);
+        const uint32_t *psrc = src;
+        for (int j=0; j < AUDIO_BLOCK_SAMPLES/2; j++) {
+            uint32_t in1 = *psrc;
+            uint32_t in2 = *(psrc + TDM_IN_N_AUDIO_BLOCKS * AUDIO_N_SAI1_RX_DATAPINS / 2);
+            psrc += TDM_IN_N_AUDIO_BLOCKS;
+            *dest1++ = ((in1 >> 16) & 0x0000FFFF) | ((in2 <<  0) & 0xFFFF0000);
+            *dest2++ = ((in1 >>  0) & 0x0000FFFF) | ((in2 << 16) & 0xFFFF0000);
+        }
+        src++;
+    }
 }
 
 void AudioInputTDM::isr(void)
 {
 	uint32_t daddr;
 	const uint32_t *src;
-	unsigned int i;
 
 	daddr = (uint32_t)(dma.TCD->DADDR);
 	dma.clearInterrupt();
@@ -115,7 +160,7 @@ void AudioInputTDM::isr(void)
 	if (daddr < (uint32_t)tdm_rx_buffer + sizeof(tdm_rx_buffer) / 2) {
 		// DMA is receiving to the first half of the buffer
 		// need to remove data from the second half
-		src = &tdm_rx_buffer[AUDIO_BLOCK_SAMPLES*8];
+		src = &tdm_rx_buffer[AUDIO_BLOCK_SAMPLES * TDM_IN_N_AUDIO_BLOCKS / 2];
 	} else {
 		// DMA is receiving to the second half of the buffer
 		// need to remove data from the first half
@@ -125,12 +170,7 @@ void AudioInputTDM::isr(void)
 		#if IMXRT_CACHE_ENABLED >=1
 		arm_dcache_delete((void*)src, sizeof(tdm_rx_buffer) / 2);
 		#endif
-		for (i=0; i < 16; i += 2) {
-			uint32_t *dest1 = (uint32_t *)(block_incoming[i]->data);
-			uint32_t *dest2 = (uint32_t *)(block_incoming[i+1]->data);
-			memcpy_tdm_rx(dest1, dest2, src);
-			src++;
-		}
+                deinterleave(block_incoming, src);
 	}
 	if (update_responsibility) update_all();
 }
@@ -138,12 +178,11 @@ void AudioInputTDM::isr(void)
 
 void AudioInputTDM::update(void)
 {
-	unsigned int i, j;
-	audio_block_t *new_block[16];
-	audio_block_t *out_block[16];
-
-	// allocate 16 new blocks.  If any fails, allocate none
-	for (i=0; i < 16; i++) {
+    unsigned int i, j;
+	audio_block_t *new_block[TDM_IN_N_AUDIO_BLOCKS];
+	audio_block_t *out_block[TDM_IN_N_AUDIO_BLOCKS];
+	// allocate TDM_IN_N_AUDIO_BLOCKS new blocks.  If any fails, allocate none
+	for (i=0; i < TDM_IN_N_AUDIO_BLOCKS; i++) {
 		new_block[i] = allocate();
 		if (new_block[i] == nullptr) {
 			for (j=0; j < i; j++) {
@@ -158,8 +197,8 @@ void AudioInputTDM::update(void)
 	memcpy(block_incoming, new_block, sizeof(block_incoming));
 	__enable_irq();
 	if (out_block[0] != nullptr) {		
-		// if we got 1 block, all 16 are filled
-		for (i=0; i < 16; i++) {
+		// if we got 1 block, all TDM_IN_N_AUDIO_BLOCKS are filled
+		for (i=0; i < TDM_IN_N_AUDIO_BLOCKS; i++) {
 			transmit(out_block[i], i);
 			release(out_block[i]);
 		}
diff --git a/input_tdm.h b/input_tdm.h
index 585c6d0f3..9647a1cfc 100644
--- a/input_tdm.h
+++ b/input_tdm.h
@@ -30,6 +30,9 @@
 #include <Arduino.h>     // github.com/PaulStoffregen/cores/blob/master/teensy4/Arduino.h
 #include <AudioStream.h> // github.com/PaulStoffregen/cores/blob/master/teensy4/AudioStream.h
 #include <DMAChannel.h>  // github.com/PaulStoffregen/cores/blob/master/teensy4/DMAChannel.h
+#include "AudioRate.h"
+
+#define TDM_IN_N_AUDIO_BLOCKS (TDM_CHANNELS_PER_PIN*AUDIO_N_SAI1_RX_DATAPINS)
 
 class AudioInputTDM : public AudioStream
 {
@@ -42,7 +45,7 @@ class AudioInputTDM : public AudioStream
 	static DMAChannel dma;
 	static void isr(void);
 private:
-	static audio_block_t *block_incoming[16];
+	static audio_block_t *block_incoming[TDM_IN_N_AUDIO_BLOCKS];
 };
 
 #endif
diff --git a/input_tdm2.cpp b/input_tdm2.cpp
index ae3ebdc33..fe8d2fcc4 100644
--- a/input_tdm2.cpp
+++ b/input_tdm2.cpp
@@ -79,8 +79,8 @@ static void memcpy_tdm_rx(uint32_t *dest1, uint32_t *dest2, const uint32_t *src)
 		in1 = *src;
 		in2 = *(src+8);
 		src += 16;
-		*dest1++ = (in1 >> 16) | (in2 & 0xFFFF0000);
-		*dest2++ = (in1 << 16) | (in2 & 0x0000FFFF);
+		*dest1++ = ((in1 >> 16) & 0x0000FFFF) | ((in2 <<  0) & 0xFFFF0000);
+		*dest2++ = ((in1 >>  0) & 0x0000FFFF) | ((in2 << 16) & 0xFFFF0000);
 	}
 }
 
diff --git a/output_tdm.cpp b/output_tdm.cpp
index 8f3a6158c..5d1210a1a 100644
--- a/output_tdm.cpp
+++ b/output_tdm.cpp
@@ -23,7 +23,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-
 #include <Arduino.h>
 
 #if !defined(KINETISL)
@@ -319,20 +318,30 @@ void AudioOutputTDM::config_tdm(void)
 	I2S1_TCR1 = I2S_TCR1_RFW(4);
 	I2S1_TCR2 = I2S_TCR2_SYNC(tsync) | I2S_TCR2_BCP | I2S_TCR2_MSEL(1)
 		| I2S_TCR2_BCD | I2S_TCR2_DIV(0);
-	I2S1_TCR3 = I2S_TCR3_TCE;
+
+    // enable all TX data pins.  Channels must be used in order, so
+    // enabled chanels can (0b0001, 0b0011, 0b0111, or 0b1111) only.  This
+    // limitation is because of the fifo combine mode limitation.
+    I2S1_TCR3 = ((1 << AUDIO_N_SAI1_TX_DATAPINS)-1) << 16;
+
 	I2S1_TCR4 = I2S_TCR4_FRSZ(7) | I2S_TCR4_SYWD(0) | I2S_TCR4_MF
 		| I2S_TCR4_FSE | I2S_TCR4_FSD;
+    I2S1_TCR4 |= (AUDIO_N_SAI1_TX_DATAPINS > 1) ? I2S_TCR4_FCOMB_ENABLED_ON_WRITES : 0;
 	I2S1_TCR5 = I2S_TCR5_WNW(31) | I2S_TCR5_W0W(31) | I2S_TCR5_FBT(31);
 
 	I2S1_RMR = 0;
 	I2S1_RCR1 = I2S_RCR1_RFW(4);
 	I2S1_RCR2 = I2S_RCR2_SYNC(rsync) | I2S_TCR2_BCP | I2S_RCR2_MSEL(1)
 		| I2S_RCR2_BCD | I2S_RCR2_DIV(0);
-	I2S1_RCR3 = I2S_RCR3_RCE;
+        // enable all RX data pins.  Channels must be used in order, so
+        // enabled chanels can (0b0001, 0b0011, 0b0111, or 0b1111) only.  This
+        // limitation is because of the fifo combine mode limitation.
+        I2S1_RCR3 = ( (1 << AUDIO_N_SAI1_RX_DATAPINS) - 1 ) << 16;
+
 	I2S1_RCR4 = I2S_RCR4_FRSZ(7) | I2S_RCR4_SYWD(0) | I2S_RCR4_MF
 		| I2S_RCR4_FSE | I2S_RCR4_FSD;
+        I2S1_RCR4 |= (AUDIO_N_SAI1_RX_DATAPINS > 1) ? I2S_RCR4_FCOMB_ENABLED_ON_READS : 0;
 	I2S1_RCR5 = I2S_RCR5_WNW(31) | I2S_RCR5_W0W(31) | I2S_RCR5_FBT(31);
-
 	CORE_PIN23_CONFIG = 3;  //1:MCLK
 	CORE_PIN21_CONFIG = 3;  //1:RX_BCLK
 	CORE_PIN20_CONFIG = 3;  //1:RX_SYNC
diff --git a/output_tdm.h b/output_tdm.h
index d019002af..0055b5583 100644
--- a/output_tdm.h
+++ b/output_tdm.h
@@ -30,6 +30,7 @@
 #include <Arduino.h>     // github.com/PaulStoffregen/cores/blob/master/teensy4/Arduino.h
 #include <AudioStream.h> // github.com/PaulStoffregen/cores/blob/master/teensy4/AudioStream.h
 #include <DMAChannel.h>  // github.com/PaulStoffregen/cores/blob/master/teensy4/DMAChannel.h
+#include "AudioRate.h"
 
 class AudioOutputTDM : public AudioStream
 {