Wren6991 · MichaelBell · Feb 17, 2023
diff --git a/software/apps/vista/CMakeLists.txt b/software/apps/vista/CMakeLists.txt
@@ -10,7 +10,7 @@ target_compile_definitions(vista PRIVATE
 	DVI_DEFAULT_SERIAL_CONFIG=${DVI_DEFAULT_SERIAL_CONFIG}
 	DVI_VERTICAL_REPEAT=1
 	DVI_N_TMDS_BUFFERS=5
-	DVI_SYMBOLS_PER_WORD=1
+	DVI_SYMBOLS_PER_WORD=2
 	)
 
 target_compile_definitions(vista PRIVATE PICO_STACK_SIZE=0x200)

diff --git a/software/apps/vista/main.c b/software/apps/vista/main.c
@@ -40,9 +40,9 @@ struct semaphore dvi_start_sem;
 
 static inline void prepare_scanline(const uint32_t *colourbuf, uint32_t *tmdsbuf) {
 	const uint pixwidth = 640;
-	tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + 0 * pixwidth, pixwidth, 4, 0);
-	tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + 1 * pixwidth, pixwidth, 10, 5);
-	tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + 2 * pixwidth, pixwidth, 15, 11);
+	tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf, pixwidth, 4, 0);
+	tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + (pixwidth >> 1), pixwidth, 10, 5);
+	tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + pixwidth, pixwidth, 15, 11);
 }
 
 void __no_inline_not_in_flash_func(flash_bulk_dma_start)(uint32_t *rxbuf, uint32_t flash_offs, size_t len, uint dma_chan)

diff --git a/software/libdvi/tmds_encode.S b/software/libdvi/tmds_encode.S
@@ -372,24 +372,30 @@ tmds_2bpp_table:
 // Here is an idea
 // Have a table with a 7 bit lookup. The lookup is the 6 colour data bits (in
 // ACCUM0), concatenated with the sign bit of our running disparity (from
-// ACCUM1). Each table entry is a 20-bit TMDS symbol (pseudodifferential),
-// with the symbol's disparity stored left-justified in the upper 12 bits, as
-// e.g. a 6 bit signed integer.
+// ACCUM1). Each table entry is a 10-bit TMDS symbol, with the symbol's disparity 
+// stored left-justified in the upper bits, as e.g. a 5 bit signed integer.
 //
-// - Load pixel data.                        cyc: 0.75 (ldmia 2 words, every 4 pixels)
+// We have 2 copies of the table, one with the TMDS symbol aligned to the right
+// and one shifted 10 bits left.  This means they can be ORed together to form
+// 2 symbols in one word.
+//
+// - Load pixel data.                        cyc: 0.625 (ldmia 4 words, every 8 pixels)
 // - Write pixel to ACCUM0.                  cyc: 1
 // - Read address from PEEK2.                cyc: 1
 // - Load encoded pixel from address.        cyc: 2
 // - Write disparity data to ACCUM1_ADD      cyc: 1
-// - Write encoded data to output buffer.    cyc: 1.25 (stmia 4 words, every 4 pixels)
+// - OR every 2 pix                          cyc: 0.5
+// - Write encoded data to output buffer.    cyc: 0.625 (stmia 4 words, every 8 pixels)
 //
-// With decent register allocation we may be able to load 4 pixels at
-// once (2 words), and write 4 at once (4 words). This gives 7 cyc/pix.
+// With decent register allocation we can load 8 pixels at once (4 words), 
+// and write 8 at once (4 words). This gives 6.75 cyc/pix.
 //
 // One issue is that the TMDS data in the bottom of ACCUM1 will eventually
-// overflow and affect the running disparity, but with 16 zeroes in between,
-// this would take much longer than one scanline, so everything is fine if
-// we clear the accumulator at the start of the scanline.
+// overflow and affect the running disparity. For the right-aligned symbols there
+// is a 17 bit gap in between, so this would take much longer than one scanline
+// and everything is fine if we clear the accumulator at the start of the scanline.
+// For the shifted symbols the gap is only 7 bits, which can overflow after 128
+// symbols, so we need to clear the bottom bits of the accumulator more often than that.
 //
 // Note that we need to use two interpolators to get the bits from both pixels
 // -- we are not outputting a single DC-balanced stream, but rather two
@@ -404,19 +410,20 @@ tmds_2bpp_table:
 // much better, and many monitors will still accept the signals as long as you
 // DC couple your DVI signals.
 
-.macro tmds_fullres_encode_loop_body ra rb
-	str \ra, [r2, #ACCUM0_OFFS + INTERP1]
-	str \ra, [r2, #ACCUM0_OFFS]
-	ldr \ra, [r2, #PEEK2_OFFS]
-	ldr \ra, [\ra]
+.macro tmds_fullres_encode_loop_body rd
+	str \rd, [r2, #ACCUM0_OFFS + INTERP1]
+	str \rd, [r2, #ACCUM0_OFFS]
+	ldr \rd, [r2, #PEEK2_OFFS]
+	ldr \rd, [\rd]
 #if !TMDS_FULLRES_NO_DC_BALANCE
-	str \ra, [r2, #ACCUM1_ADD_OFFS]
+	str \rd, [r2, #ACCUM1_ADD_OFFS]
 #endif
-	ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
-	ldr \rb, [\rb]
+	ldr r7, [r2, #PEEK2_OFFS + INTERP1]
+	ldr r7, [r7]
 #if !TMDS_FULLRES_NO_DC_BALANCE
-	str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
+	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
 #endif
+	orrs \rd, r7
 .endm
 
 // r0: Input buffer (word-aligned)
@@ -429,7 +436,7 @@ tmds_2bpp_table:
 	push {r4}
 
 
-	lsls r2, #2
+	lsls r2, #1
 	add r2, r1
 	mov ip, r2
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
@@ -449,12 +456,21 @@ tmds_2bpp_table:
 	b 2f
 	.align 2
 1:
-.rept 16
-	ldmia r0!, {r4, r6}
-	tmds_fullres_encode_loop_body r4 r5
-	tmds_fullres_encode_loop_body r6 r7
-	stmia r1!, {r4, r5, r6, r7}
+.rept 10 // 80 pixels per iteration
+	ldmia r0!, {r3, r4, r5, r6}
+	tmds_fullres_encode_loop_body r3
+	tmds_fullres_encode_loop_body r4
+	tmds_fullres_encode_loop_body r5
+	tmds_fullres_encode_loop_body r6
+	stmia r1!, {r3, r4, r5, r6}
 .endr
+	// Need to clear the bottom of the balance tracker for interp1, as
+	// otherwise the shifted TMDS symbols will overflow into the disparity bits
+	ldr r7, [r2, #ACCUM1_OFFS + INTERP1]
+	movs r6, #0x1F
+	lsls r6, #27
+	ands r7, r6
+	str r7, [r2, #ACCUM1_OFFS + INTERP1]
 2:
 	cmp r1, ip
 	beq 1f
@@ -472,21 +488,22 @@ decl_func_y tmds_fullres_encode_loop_16bpp_y
 	tmds_fullres_encode_loop_16bpp
 
 
-.macro tmds_fullres_encode_loop_body_leftshift ra rb
+.macro tmds_fullres_encode_loop_body_leftshift rd
 	// Note we apply the leftshift for INTERP0 only
-	str \ra, [r2, #ACCUM0_OFFS + INTERP1]
-	lsls \ra, r3
-	str \ra, [r2, #ACCUM0_OFFS]
-	ldr \ra, [r2, #PEEK2_OFFS]
-	ldr \ra, [\ra]
+	str \rd, [r2, #ACCUM0_OFFS + INTERP1]
+	lsls \rd, r3
+	str \rd, [r2, #ACCUM0_OFFS]
+	ldr \rd, [r2, #PEEK2_OFFS]
+	ldr \rd, [\rd]
 #if !TMDS_FULLRES_NO_DC_BALANCE
-	str \ra, [r2, #ACCUM1_ADD_OFFS]
+	str \rd, [r2, #ACCUM1_ADD_OFFS]
 #endif
-	ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
-	ldr \rb, [\rb]
+	ldr r7, [r2, #PEEK2_OFFS + INTERP1]
+	ldr r7, [r7]
 #if !TMDS_FULLRES_NO_DC_BALANCE
-	str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
+	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
 #endif
+	orrs \rd, r7
 .endm
 
 // r0: Input buffer (word-aligned)
@@ -500,7 +517,7 @@ decl_func_y tmds_fullres_encode_loop_16bpp_y
 	mov r5, r9
 	push {r4-r5}
 
-	lsls r2, #2
+	lsls r2, #1
 	add r2, r1
 	mov ip, r2
 	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
@@ -519,12 +536,24 @@ decl_func_y tmds_fullres_encode_loop_16bpp_y
 	b 2f
 	.align 2
 1:
-.rept 16 // 64 pixels per iteration
-	ldmia r0!, {r4, r6}
-	tmds_fullres_encode_loop_body_leftshift r4 r5
-	tmds_fullres_encode_loop_body_leftshift r6 r7
-	stmia r1!, {r4, r5, r6, r7}
+.rept 13 // 13 x 6 = 78 pixels; the single word after .endr makes 80 per iteration
+	ldmia r0!, {r4, r5, r6}
+	tmds_fullres_encode_loop_body_leftshift r4
+	tmds_fullres_encode_loop_body_leftshift r5
+	tmds_fullres_encode_loop_body_leftshift r6
+	stmia r1!, {r4, r5, r6}
 .endr
+	ldmia r0!, {r4}
+	tmds_fullres_encode_loop_body_leftshift r4
+	stmia r1!, {r4}
+
+	// Need to clear the bottom of the balance tracker for interp1, as
+	// otherwise the shifted TMDS symbols will overflow into the disparity bits
+	ldr r7, [r2, #ACCUM1_OFFS + INTERP1]
+	movs r6, #0x1F
+	lsls r6, #27
+	ands r7, r6
+	str r7, [r2, #ACCUM1_OFFS + INTERP1]
 2:
 	cmp r1, ip
 	beq 1f

diff --git a/software/libdvi/tmds_encode.c b/software/libdvi/tmds_encode.c
@@ -7,18 +7,24 @@ static const uint32_t __scratch_x("tmds_table") tmds_table[] = {
 #include "tmds_table.h"
 };
 
-// Fullres table is bandwidth-critical, so gets one copy for each scratch
-// memory. There is a third copy which can go in flash, because it's just used
-// to generate palette LUTs. The ones we don't use will get garbage collected
-// during linking.
+// Fullres tables are bandwidth-critical, so each gets one copy per scratch
+// memory. Any copies we don't use will get garbage collected during linking.
 const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
 #include "tmds_table_fullres.h"
 };
 
+const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_shifted_x[] = {
+#include "tmds_table_fullres_shifted.h"
+};
+
 const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
 #include "tmds_table_fullres.h"
 };
 
+const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_shifted_y[] = {
+#include "tmds_table_fullres_shifted.h"
+};
+
 // Configure an interpolator to extract a single colour channel from each of a pair
 // of pixels, with the first pixel's lsb at pixel_lsb, and the pixels being
 // pixel_width wide. Produce a LUT address for the first pixel's colour data on
@@ -147,6 +153,7 @@ void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t
 	// tread on each other's toes too much.
 	const uint32_t *lutbase = core ? tmds_table_fullres_x : tmds_table_fullres_y;
 	int lshift_lower = configure_interp_for_addrgen_fullres(interp0_hw, channel_msb, channel_lsb, 6, lutbase);
+	lutbase = core ? tmds_table_fullres_shifted_x : tmds_table_fullres_shifted_y;
 	int lshift_upper = configure_interp_for_addrgen_fullres(interp1_hw, channel_msb + 16, channel_lsb + 16, 6, lutbase);
 	assert(!lshift_upper); (void)lshift_upper;
 	if (lshift_lower) {

diff --git a/software/libdvi/tmds_table_fullres.h b/software/libdvi/tmds_table_fullres.h
@@ -1,6 +1,6 @@
 // Each entry consists of a 10 bit TMDS symbol in pseudo-differential format
-// (10 LSBs) and the symbol's disparity as a 6 bit signed integer (the 6
-// MSBs). There is a 16 bit gap in between them, which is actually vital for
+// (10 LSBs) and the symbol's disparity as a 5 bit signed integer (the 5
+// MSBs). There is a 17 bit gap in between them, which is actually vital for
 // the way the TMDS encode works!
 //
 // There are 128 1-word entries. The lookup index should be the concatenation

diff --git a/software/libdvi/tmds_table_fullres_shifted.h b/software/libdvi/tmds_table_fullres_shifted.h
@@ -0,0 +1,139 @@
+// Each entry consists of a 10 bit TMDS symbol in pseudo-differential format
+// shifted left by 10 bits, and the symbol's disparity as a 5 bit signed integer 
+// (the 5 MSBs). There is a 7 bit gap in between them, which is actually vital for
+// the way the TMDS encode works!
+//
+// There are 128 1-word entries. The lookup index should be the concatenation
+// of the sign bit of current running disparity, with 6 bits of colour channel
+// data.
+
+// Non-negative running disparity:
+0xe0040000,
+0xf80c0c00,
+0x000c1c00,
+0xe8041000,
+0x0007c000,
+0xf0043000,
+0xe8042000,
+0x000c2c00,
+0xf8078000,
+0xf8047000,
+0xf0046000,
+0x00079000,
+0xe8044000,
+0x000c4c00,
+0x0007a000,
+0xf0090400,
+0xf0070000,
+0x0004f000,
+0xf804e000,
+0xf8071000,
+0xf004c000,
+0x00073000,
+0xf8072000,
+0xf8098400,
+0xe8048000,
+0x000c8c00,
+0x00076000,
+0x0009c400,
+0xf8074000,
+0xf0021800,
+0xe8020800,
+0xf00a0400,
+0xe8060000,
+0x000e0c00,
+0x0005e000,
+0xf0061000,
+0xf805c000,
+0xf8063000,
+0xf0062000,
+0xf0088400,
+0xf0058000,
+0x00067000,
+0xf8066000,
+0xf808c400,
+0xf0064000,
+0x0008e400,
+0xf0030800,
+0xf80b0400,
+0xe8050000,
+0x000d0c00,
+0x0006e000,
+0xf0084400,
+0xf806c000,
+0xf8086400,
+0x00087400,
+0x000b8400,
+0xf0068000,
+0xf0082400,
+0xf8083400,
+0xf0003800,
+0xf0081400,
+0xe8001800,
+0xe0000800,
+0xe8080400,
+// Negative running disparity:
+0x280ffc00,
+0x1007f000,
+0x0807e000,
+0x200fec00,
+0x0007c000,
+0x180fcc00,
+0x200fdc00,
+0x0807d000,
+0x100c7c00,
+0x100f8c00,
+0x180f9c00,
+0x00079000,
+0x200fbc00,
+0x0807b000,
+0x0007a000,
+0x0802f800,
+0x180cfc00,
+0x0004f000,
+0x100f1c00,
+0x100cec00,
+0x180f3c00,
+0x00073000,
+0x100cdc00,
+0x00027800,
+0x200f7c00,
+0x08077000,
+0x00076000,
+0x0009c400,
+0x100cbc00,
+0x0809e400,
+0x1009f400,
+0x0801f800,
+0x200dfc00,
+0x0805f000,
+0x0005e000,
+0x180dec00,
+0x100e3c00,
+0x100dcc00,
+0x180ddc00,
+0x08037800,
+0x180e7c00,
+0x00067000,
+0x100d9c00,
+0x00033800,
+0x180dbc00,
+0x0008e400,
+0x0808f400,
+0x0000f800,
+0x200efc00,
+0x0806f000,
+0x0006e000,
+0x0803b800,
+0x100d3c00,
+0x00039800,
+0x00087400,
+0x000b8400,
+0x180d7c00,
+0x0803d800,
+0x0003c800,
+0x080bc400,
+0x0803e800,
+0x100be400,
+0x180bf400,
+0x1003f800,