Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use 2 symbols per word on full res encode #43

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion software/apps/vista/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ target_compile_definitions(vista PRIVATE
DVI_DEFAULT_SERIAL_CONFIG=${DVI_DEFAULT_SERIAL_CONFIG}
DVI_VERTICAL_REPEAT=1
DVI_N_TMDS_BUFFERS=5
DVI_SYMBOLS_PER_WORD=1
DVI_SYMBOLS_PER_WORD=2
)

target_compile_definitions(vista PRIVATE PICO_STACK_SIZE=0x200)
Expand Down
6 changes: 3 additions & 3 deletions software/apps/vista/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ struct semaphore dvi_start_sem;

static inline void prepare_scanline(const uint32_t *colourbuf, uint32_t *tmdsbuf) {
const uint pixwidth = 640;
tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + 0 * pixwidth, pixwidth, 4, 0);
tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + 1 * pixwidth, pixwidth, 10, 5);
tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + 2 * pixwidth, pixwidth, 15, 11);
tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf, pixwidth, 4, 0);
tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + (pixwidth >> 1), pixwidth, 10, 5);
tmds_encode_data_channel_fullres_16bpp(colourbuf, tmdsbuf + pixwidth, pixwidth, 15, 11);
}

void __no_inline_not_in_flash_func(flash_bulk_dma_start)(uint32_t *rxbuf, uint32_t flash_offs, size_t len, uint dma_chan)
Expand Down
111 changes: 70 additions & 41 deletions software/libdvi/tmds_encode.S
Original file line number Diff line number Diff line change
Expand Up @@ -372,24 +372,30 @@ tmds_2bpp_table:
// Here is an idea
// Have a table with a 7 bit lookup. The lookup is the 6 colour data bits (in
// ACCUM0), concatenated with the sign bit of our running disparity (from
// ACCUM1). Each table entry is a 20-bit TMDS symbol (pseudodifferential),
// with the symbol's disparity stored left-justified in the upper 12 bits, as
// e.g. a 6 bit signed integer.
// ACCUM1). Each table entry is a 10-bit TMDS symbol, with the symbol's disparity
// stored left-justified in the upper bits, as e.g. a 5 bit signed integer.
//
// - Load pixel data. cyc: 0.75 (ldmia 2 words, every 4 pixels)
// We have 2 copies of the table, one with the TMDS symbol aligned to the right
// and one shifted 10 bits left. This means they can be ORed together to form
// 2 symbols in one word.
//
// - Load pixel data. cyc: 0.625 (ldmia 4 words, every 8 pixels)
// - Write pixel to ACCUM0. cyc: 1
// - Read address from PEEK2. cyc: 1
// - Load encoded pixel from address. cyc: 2
// - Write disparity data to ACCUM1_ADD cyc: 1
// - Write encoded data to output buffer. cyc: 1.25 (stmia 4 words, every 4 pixels)
// - OR every 2 pix cyc: 0.5
// - Write encoded data to output buffer. cyc: 0.625 (stmia 4 words, every 8 pixels)
//
// With decent register allocation we may be able to load 4 pixels at
// once (2 words), and write 4 at once (4 words). This gives 7 cyc/pix.
// With decent register allocation we can load 8 pixels at once (4 words),
// and write 8 at once (4 words). This gives 6.75 cyc/pix.
//
// One issue is that the TMDS data in the bottom of ACCUM1 will eventually
// overflow and affect the running disparity, but with 16 zeroes in between,
// this would take much longer than one scanline, so everything is fine if
// we clear the accumulator at the start of the scanline.
// overflow and affect the running disparity. For the right-aligned symbols there
// are 17 zeroes in between, so this would take much longer than one scanline, and
// everything is fine if we clear the accumulator at the start of the scanline.
// For the shifted symbols there are only 7 zeroes in between, so this overflows
// after 128 symbols, and we need to clear the bottom bits of the accumulator more
// often than that.
//
// Note that we need to use two interpolators to get the bits from both pixels
// -- we are not outputting a single DC-balanced stream, but rather two
Expand All @@ -404,19 +410,20 @@ tmds_2bpp_table:
// much better, and many monitors will still accept the signals as long as you
// DC couple your DVI signals.

.macro tmds_fullres_encode_loop_body ra rb
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
str \ra, [r2, #ACCUM0_OFFS]
ldr \ra, [r2, #PEEK2_OFFS]
ldr \ra, [\ra]
.macro tmds_fullres_encode_loop_body rd
str \rd, [r2, #ACCUM0_OFFS + INTERP1]
str \rd, [r2, #ACCUM0_OFFS]
ldr \rd, [r2, #PEEK2_OFFS]
ldr \rd, [\rd]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \ra, [r2, #ACCUM1_ADD_OFFS]
str \rd, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
ldr \rb, [\rb]
ldr r7, [r2, #PEEK2_OFFS + INTERP1]
ldr r7, [r7]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
orrs \rd, r7
.endm

// r0: Input buffer (word-aligned)
Expand All @@ -429,7 +436,7 @@ tmds_2bpp_table:
push {r4}


lsls r2, #2
lsls r2, #1
add r2, r1
mov ip, r2
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
Expand All @@ -449,12 +456,21 @@ tmds_2bpp_table:
b 2f
.align 2
1:
.rept 16
ldmia r0!, {r4, r6}
tmds_fullres_encode_loop_body r4 r5
tmds_fullres_encode_loop_body r6 r7
stmia r1!, {r4, r5, r6, r7}
.rept 10 // 80 pixels per iteration
ldmia r0!, {r3, r4, r5, r6}
tmds_fullres_encode_loop_body r3
tmds_fullres_encode_loop_body r4
tmds_fullres_encode_loop_body r5
tmds_fullres_encode_loop_body r6
stmia r1!, {r3, r4, r5, r6}
.endr
// Need to clear the bottom of the balance tracker for interp1, as
// otherwise the shifted TMDS symbols will overflow into the disparity bits
ldr r7, [r2, #ACCUM1_OFFS + INTERP1]
movs r6, #0x1F
lsls r6, #27
ands r7, r6
str r7, [r2, #ACCUM1_OFFS + INTERP1]
2:
cmp r1, ip
beq 1f
Expand All @@ -472,21 +488,22 @@ decl_func_y tmds_fullres_encode_loop_16bpp_y
tmds_fullres_encode_loop_16bpp


.macro tmds_fullres_encode_loop_body_leftshift ra rb
.macro tmds_fullres_encode_loop_body_leftshift rd
// Note we apply the leftshift for INTERP0 only
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
lsls \ra, r3
str \ra, [r2, #ACCUM0_OFFS]
ldr \ra, [r2, #PEEK2_OFFS]
ldr \ra, [\ra]
str \rd, [r2, #ACCUM0_OFFS + INTERP1]
lsls \rd, r3
str \rd, [r2, #ACCUM0_OFFS]
ldr \rd, [r2, #PEEK2_OFFS]
ldr \rd, [\rd]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \ra, [r2, #ACCUM1_ADD_OFFS]
str \rd, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
ldr \rb, [\rb]
ldr r7, [r2, #PEEK2_OFFS + INTERP1]
ldr r7, [r7]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
orrs \rd, r7
.endm

// r0: Input buffer (word-aligned)
Expand All @@ -500,7 +517,7 @@ decl_func_y tmds_fullres_encode_loop_16bpp_y
mov r5, r9
push {r4-r5}

lsls r2, #2
lsls r2, #1
add r2, r1
mov ip, r2
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
Expand All @@ -519,12 +536,24 @@ decl_func_y tmds_fullres_encode_loop_16bpp_y
b 2f
.align 2
1:
.rept 16 // 64 pixels per iteration
ldmia r0!, {r4, r6}
tmds_fullres_encode_loop_body_leftshift r4 r5
tmds_fullres_encode_loop_body_leftshift r6 r7
stmia r1!, {r4, r5, r6, r7}
.rept 13 // 80 pixels per iteration
ldmia r0!, {r4, r5, r6}
tmds_fullres_encode_loop_body_leftshift r4
tmds_fullres_encode_loop_body_leftshift r5
tmds_fullres_encode_loop_body_leftshift r6
stmia r1!, {r4, r5, r6}
.endr
ldmia r0!, {r4}
tmds_fullres_encode_loop_body_leftshift r4
stmia r1!, {r4}

// Need to clear the bottom of the balance tracker for interp1, as
// otherwise the shifted TMDS symbols will overflow into the disparity bits
ldr r7, [r2, #ACCUM1_OFFS + INTERP1]
movs r6, #0x1F
lsls r6, #27
ands r7, r6
str r7, [r2, #ACCUM1_OFFS + INTERP1]
2:
cmp r1, ip
beq 1f
Expand Down
15 changes: 11 additions & 4 deletions software/libdvi/tmds_encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,24 @@ static const uint32_t __scratch_x("tmds_table") tmds_table[] = {
#include "tmds_table.h"
};

// Fullres table is bandwidth-critical, so gets one copy for each scratch
// memory. There is a third copy which can go in flash, because it's just used
// to generate palette LUTs. The ones we don't use will get garbage collected
// during linking.
// Fullres tables are bandwidth-critical, so each gets one copy per scratch
// memory. Any copies we don't use will be garbage collected during linking.
const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
#include "tmds_table_fullres.h"
};

const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_shifted_x[] = {
#include "tmds_table_fullres_shifted.h"
};

const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
#include "tmds_table_fullres.h"
};

const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_shifted_y[] = {
#include "tmds_table_fullres_shifted.h"
};

// Configure an interpolator to extract a single colour channel from each of a pair
// of pixels, with the first pixel's lsb at pixel_lsb, and the pixels being
// pixel_width wide. Produce a LUT address for the first pixel's colour data on
Expand Down Expand Up @@ -147,6 +153,7 @@ void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t
// tread on each other's toes too much.
const uint32_t *lutbase = core ? tmds_table_fullres_x : tmds_table_fullres_y;
int lshift_lower = configure_interp_for_addrgen_fullres(interp0_hw, channel_msb, channel_lsb, 6, lutbase);
lutbase = core ? tmds_table_fullres_shifted_x : tmds_table_fullres_shifted_y;
int lshift_upper = configure_interp_for_addrgen_fullres(interp1_hw, channel_msb + 16, channel_lsb + 16, 6, lutbase);
assert(!lshift_upper); (void)lshift_upper;
if (lshift_lower) {
Expand Down
4 changes: 2 additions & 2 deletions software/libdvi/tmds_table_fullres.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Each entry consists of a 10 bit TMDS symbol in pseudo-differential format
// (10 LSBs) and the symbol's disparity as a 6 bit signed integer (the 6
// MSBs). There is a 16 bit gap in between them, which is actually vital for
// (10 LSBs) and the symbol's disparity as a 5 bit signed integer (the 5
// MSBs). There is a 17 bit gap in between them, which is actually vital for
// the way the TMDS encode works!
//
// There are 128 1-word entries. The lookup index should be the concatenation
Expand Down
139 changes: 139 additions & 0 deletions software/libdvi/tmds_table_fullres_shifted.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// Each entry consists of a 10 bit TMDS symbol in pseudo-differential format
// shifted left by 10 bits, and the symbol's disparity as a 5 bit signed integer
// (the 5 MSBs). There is a 7 bit gap in between them, which is actually vital for
// the way the TMDS encode works!
//
// There are 128 1-word entries. The lookup index should be the concatenation
// of the sign bit of current running disparity, with 6 bits of colour channel
// data.

// Non-negative running disparity:
0xe0040000,
0xf80c0c00,
0x000c1c00,
0xe8041000,
0x0007c000,
0xf0043000,
0xe8042000,
0x000c2c00,
0xf8078000,
0xf8047000,
0xf0046000,
0x00079000,
0xe8044000,
0x000c4c00,
0x0007a000,
0xf0090400,
0xf0070000,
0x0004f000,
0xf804e000,
0xf8071000,
0xf004c000,
0x00073000,
0xf8072000,
0xf8098400,
0xe8048000,
0x000c8c00,
0x00076000,
0x0009c400,
0xf8074000,
0xf0021800,
0xe8020800,
0xf00a0400,
0xe8060000,
0x000e0c00,
0x0005e000,
0xf0061000,
0xf805c000,
0xf8063000,
0xf0062000,
0xf0088400,
0xf0058000,
0x00067000,
0xf8066000,
0xf808c400,
0xf0064000,
0x0008e400,
0xf0030800,
0xf80b0400,
0xe8050000,
0x000d0c00,
0x0006e000,
0xf0084400,
0xf806c000,
0xf8086400,
0x00087400,
0x000b8400,
0xf0068000,
0xf0082400,
0xf8083400,
0xf0003800,
0xf0081400,
0xe8001800,
0xe0000800,
0xe8080400,
// Negative running disparity:
0x280ffc00,
0x1007f000,
0x0807e000,
0x200fec00,
0x0007c000,
0x180fcc00,
0x200fdc00,
0x0807d000,
0x100c7c00,
0x100f8c00,
0x180f9c00,
0x00079000,
0x200fbc00,
0x0807b000,
0x0007a000,
0x0802f800,
0x180cfc00,
0x0004f000,
0x100f1c00,
0x100cec00,
0x180f3c00,
0x00073000,
0x100cdc00,
0x00027800,
0x200f7c00,
0x08077000,
0x00076000,
0x0009c400,
0x100cbc00,
0x0809e400,
0x1009f400,
0x0801f800,
0x200dfc00,
0x0805f000,
0x0005e000,
0x180dec00,
0x100e3c00,
0x100dcc00,
0x180ddc00,
0x08037800,
0x180e7c00,
0x00067000,
0x100d9c00,
0x00033800,
0x180dbc00,
0x0008e400,
0x0808f400,
0x0000f800,
0x200efc00,
0x0806f000,
0x0006e000,
0x0803b800,
0x100d3c00,
0x00039800,
0x00087400,
0x000b8400,
0x180d7c00,
0x0803d800,
0x0003c800,
0x080bc400,
0x0803e800,
0x100be400,
0x180bf400,
0x1003f800,