diff --git a/CMakeLists.txt b/CMakeLists.txt
index 139cbc3..633b310 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,9 +71,12 @@ if (ARMV8A)
 endif()
 
 set(GPIO_TFT_DATA_CONTROL 0 CACHE STRING "Explicitly specify the Data/Control GPIO pin (sometimes also called Register Select)")
-if (GPIO_TFT_DATA_CONTROL)
-	message(STATUS "Using GPIO pin ${GPIO_TFT_DATA_CONTROL} for Data/Control line")
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPIO_TFT_DATA_CONTROL=${GPIO_TFT_DATA_CONTROL}")
+if (GPIO_TFT_DATA_CONTROL GREATER 0) # Positive value: 4-wire SPI, the pin number is forwarded to the compiler together with SPI_4WIRE
+	message(STATUS "Using 4-wire SPI mode of communication, with GPIO pin ${GPIO_TFT_DATA_CONTROL} for Data/Control line")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPIO_TFT_DATA_CONTROL=${GPIO_TFT_DATA_CONTROL} -DSPI_4WIRE=1")
+elseif (GPIO_TFT_DATA_CONTROL LESS 0) # Negative value (e.g. -1): 3-wire SPI. Note that the default value 0 defines neither SPI_4WIRE nor SPI_3WIRE here
+	message(STATUS "Using 3-wire SPI mode of communication, i.e. a display that does not have a Data/Control line")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPI_3WIRE=1")
 endif()
 
 set(GPIO_TFT_RESET_PIN 0 CACHE STRING "Explicitly specify the Reset GPIO pin (leave out if there is no Reset line)")
diff --git a/README.md b/README.md
index 1414fdd..2e064aa 100644
--- a/README.md
+++ b/README.md
@@ -130,7 +130,7 @@ If you connected wires directly on the Pi instead of using a Hat from the above
 
 And additionally, pass the following to customize the GPIO pin assignments you used:
 
-- `-DGPIO_TFT_DATA_CONTROL=number`: Specifies/overrides which GPIO pin to use for the Data/Control (DC) line on the 4-wire SPI communication. This pin number is specified in BCM pin numbers.
+- `-DGPIO_TFT_DATA_CONTROL=number`: Specifies/overrides which GPIO pin to use for the Data/Control (DC) line on the 4-wire SPI communication. This pin number is specified in BCM pin numbers. If you have a 3-wire SPI display that does not have a Data/Control line, **set this value to -1**, i.e. `-DGPIO_TFT_DATA_CONTROL=-1` to tell fbcp-ili9341 to target 3-wire ("9-bit") SPI communication.
 - `-DGPIO_TFT_RESET_PIN=number`: Specifies/overrides which GPIO pin to use for the display Reset line. This pin number is specified in BCM pin numbers. If omitted, it is assumed that the display does not have a Reset pin, and is always on.
 - `-DGPIO_TFT_BACKLIGHT=number`: Specifies/overrides which GPIO pin to use for the display backlight line. This pin number is specified in BCM pin numbers. If omitted, it is assumed that the display does not have a GPIO-controlled backlight pin, and is always on. If setting this, also see the `#define BACKLIGHT_CONTROL` option in `config.h`.
 
@@ -395,7 +395,13 @@ If fbcp-ili9341 does not support your display controller, you will have to write
 
 #### Does fbcp-ili9341 work with 3-wire SPI displays?
 
-No, only 4-wire SPI displays work. Make sure the display has a Data/Control (DC) GPIO pin to connect.
+Yes! This is a more recent experimental feature that may not be as stable, and there are some limitations, but 3-wire ("9-bit") SPI display support is now available. If you have a 3-wire SPI display, i.e. one that does not have a Data/Control (DC) GPIO pin to connect, configure it via CMake with the directive `-DGPIO_TFT_DATA_CONTROL=-1` to tell fbcp-ili9341 that it should drive the display with the 3-wire protocol.
+
+Current limitations of 3-wire communication are:
+ - The performance option `ALL_TASKS_SHOULD_DMA` is currently not supported, because there is an issue with DMA chaining that prevents it from being enabled. As a result, CPU usage on 3-wire displays will be slightly higher than on 4-wire displays.
+ - The performance option `OFFLOAD_PIXEL_COPY_TO_DMA_CPP` is currently not supported. As a result, 3-wire displays may not work that well on single core Pis like Pi Zero.
+ - This has only been tested on my Adafruit SSD1351 128x96 RGB OLED display, which can be soldered to operate in 3-wire SPI mode, so testing has not been particularly extensive.
+ - Displays that have a 16-bit wide command word, such as ILI9486, do not currently work in 3-wire ("17-bit") mode. (But ILI9486L has 8-bit command word, so that does work)
 
 #### Does fbcp-ili9341 work with I2C, DPI, MIPI DSI or USB connected displays?
 
@@ -433,7 +439,7 @@ Unfortunately there are a number of things to go wrong that all result in a whit
 - double check that the display controller is really what you expected. Trying to drive with the display with wrong initialization code usually results in the display not reacting, and the screen stays white,
 - shut down and physically power off the Pi and the display in between multiple tests. Driving a display with a wrong initialization routine may put it in a bad state that needs a physical power off for it to reset,
 - if there is a reset pin on the display, make sure to pass it in CMake line. Or alternatively, try driving fbcp-ili9341 without specifying the reset pin,
-- make sure the display is configured to run 4-wire SPI mode, and not in parallel mode or 3-wire SPI mode. You may need to solder or desolder some connections or set a jumper to configure the specific driving mode.
+- make sure the display is configured to run 4-wire SPI mode, and not in parallel mode or 3-wire SPI mode. You may need to solder or desolder some connections or set a jumper to configure the specific driving mode. Support for 3-wire SPI displays does exist, but it is more limited and a bit experimental.
 
 #### The display stays blank at boot without lighting up
 
@@ -481,7 +487,7 @@ You can also try looking through the commit history to find changes related to y
 
 #### Which SPI display should I buy to make sure it works best with fbcp-ili9341?
 
-First, make sure the display is a 4-wire SPI and not a 3-wire one. fbcp-ili9341 does not currently support 3-wire SPI. A display is 4-wire SPI if it has a Data/Control (DC) GPIO line that needs connecting.
+First, make sure the display is a 4-wire SPI and not a 3-wire one. A display is 4-wire SPI if it has a Data/Control (DC) GPIO line that needs connecting. Sometimes the D/C pin is labeled RS (Register Select). Support for 3-wire SPI displays does exist, but it is experimental and not nearly as well tested as 4-wire displays.
 
 Second is the consideration about display speed. Below is a performance chart of the different displays I have tested. Note that these are sample sizes of one, I don't know how much sample variance there exists. Also I don't know if it is likely that there exists big differences between displays with same controller from different manufacturers. At least the different ILI9341 displays that I have are all quite consistent on performance, whether they are from Adafruit or WaveShare or from BuyDisplay.com.
 
diff --git a/config.h b/config.h
index ced2b29..9869a23 100644
--- a/config.h
+++ b/config.h
@@ -110,7 +110,7 @@
 // requires that ALL_TASKS_SHOULD_DMA is also enabled.
 // #define UPDATE_FRAMES_WITHOUT_DIFFING
 
-#if defined(SINGLE_CORE_BOARD) && defined(USE_DMA_TRANSFERS)
+#if defined(SINGLE_CORE_BOARD) && defined(USE_DMA_TRANSFERS) && !defined(SPI_3WIRE) // TODO: 3-wire SPI displays are not yet compatible with ALL_TASKS_SHOULD_DMA option.
 // These are prerequisites for good performance on Pi Zero
 #ifndef ALL_TASKS_SHOULD_DMA
 #define ALL_TASKS_SHOULD_DMA
diff --git a/display.h b/display.h
index 0f8bb8f..990fa3a 100644
--- a/display.h
+++ b/display.h
@@ -87,9 +87,10 @@
 #define SPI_BYTESPERPIXEL 2
 #endif
 
-#if (DISPLAY_DRAWABLE_WIDTH % 16 == 0) && defined(ALL_TASKS_SHOULD_DMA) &&!defined(USE_SPI_THREAD) && defined(USE_GPU_VSYNC) && !defined(DISPLAY_COLOR_FORMAT_R6X2G6X2B6X2)
+#if (DISPLAY_DRAWABLE_WIDTH % 16 == 0) && defined(ALL_TASKS_SHOULD_DMA) &&!defined(USE_SPI_THREAD) && defined(USE_GPU_VSYNC) && !defined(DISPLAY_COLOR_FORMAT_R6X2G6X2B6X2) && !defined(SPI_3WIRE)
 // If conditions are suitable, defer moving pixels until the very last moment in dma.cpp when we are about
 // to kick off DMA tasks.
+// TODO: 3-wire SPI displays are not yet compatible with this path. Implement support for this to optimize performance of 3-wire SPI displays on Pi Zero. (Pi 3B does not care that much)
 #define OFFLOAD_PIXEL_COPY_TO_DMA_CPP
 #endif
 
@@ -106,6 +107,6 @@ void DeinitSPIDisplay(void);
 #error Please define -DSPI_BUS_CLOCK_DIVISOR=<some even number> on the CMake command line! This parameter along with core_freq=xxx in /boot/config.txt defines the SPI display speed. (spi speed = core_freq / SPI_BUS_CLOCK_DIVISOR)
 #endif
 
-#if !defined(GPIO_TFT_DATA_CONTROL)
+#if !defined(GPIO_TFT_DATA_CONTROL) && !defined(SPI_3WIRE)
 #error Please reconfigure CMake with -DGPIO_TFT_DATA_CONTROL=<int> specifying which pin your display is using for the Data/Control line!
 #endif
diff --git a/dma.cpp b/dma.cpp
index b1e9ccb..1903041 100644
--- a/dma.cpp
+++ b/dma.cpp
@@ -488,6 +488,17 @@ static void memcpy_to_dma_and_prev_framebuffer_in_c(uint16_t *dstDma, uint16_t *
   *dstPrevFramebuffer = prevData;
 }
 
+#if defined(ALL_TASKS_SHOULD_DMA) && defined(SPI_3WIRE)
+// Bug: there is something about the chained DMA transfer mechanism that makes write window coordinate set commands not go through properly
+// on 3-wire displays, but do not yet know what. (Remove this #error statement to debug)
+#error ALL_TASKS_SHOULD_DMA and SPI_3WIRE are currently not mutually compatible!
+#endif
+
+#if defined(OFFLOAD_PIXEL_COPY_TO_DMA_CPP) && defined(SPI_3WIRE)
+// We would have to convert 8-bit tasks to 9-bit tasks immediately after offloaded memcpy has been done below to implement this.
+#error OFFLOAD_PIXEL_COPY_TO_DMA_CPP and SPI_3WIRE are not mutually compatible!
+#endif
+
 void SPIDMATransfer(SPITask *task)
 {
 // There is a limit to how many bytes can be sent in one DMA-based SPI task, so if the task
@@ -495,9 +506,9 @@ void SPIDMATransfer(SPITask *task)
 // and chain them together. This should be a multiple of 32 bytes to keep tasks cache aligned on ARMv6.
 #define MAX_DMA_SPI_TASK_SIZE 65504
 
-  const int numDMASendTasks = (task->size + MAX_DMA_SPI_TASK_SIZE - 1) / MAX_DMA_SPI_TASK_SIZE;
+  const int numDMASendTasks = (task->PayloadSize() + MAX_DMA_SPI_TASK_SIZE - 1) / MAX_DMA_SPI_TASK_SIZE;
 
-  volatile uint32_t *dmaData = (volatile uint32_t *)GrabFreeDMASourceBytes(4*(numDMASendTasks-1)+4*numDMASendTasks+task->size);
+  volatile uint32_t *dmaData = (volatile uint32_t *)GrabFreeDMASourceBytes(4*(numDMASendTasks-1)+4*numDMASendTasks+task->PayloadSize());
   volatile uint32_t *setDMATxAddressData = dmaData;
   volatile uint32_t *txData = dmaData+numDMASendTasks-1;
 
@@ -510,12 +521,12 @@ void SPIDMATransfer(SPITask *task)
 #ifdef OFFLOAD_PIXEL_COPY_TO_DMA_CPP
   uint8_t *data = task->fb;
   uint8_t *prevData = task->prevFb;
-  const bool taskAndFramebufferSizesCompatibleWithTightMemcpy = (task->size % 32 == 0) && (task->width % 16 == 0);
+  const bool taskAndFramebufferSizesCompatibleWithTightMemcpy = (task->PayloadSize() % 32 == 0) && (task->width % 16 == 0);
 #else
-  uint8_t *data = task->data;
+  uint8_t *data = task->PayloadStart();
 #endif
 
-  int bytesLeft = task->size;
+  int bytesLeft = task->PayloadSize();
   int taskStartX = 0;
 
   while(bytesLeft > 0)
@@ -629,10 +640,12 @@ void SPIDMATransfer(SPITask *task)
   }
   if (!programRunning) return;
 
-  pendingTaskBytes = task->size;
+  pendingTaskBytes = task->PayloadSize();
 
   // First send the SPI command byte in Polled SPI mode
   spi->cs = BCM2835_SPI0_CS_TA | BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
+
+#ifdef SPI_4WIRE
   CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);
 #ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
   spi->fifo = 0;
@@ -647,6 +660,8 @@ void SPIDMATransfer(SPITask *task)
 #endif
 
   SET_GPIO(GPIO_TFT_DATA_CONTROL);
+#endif
+
   spi->cs = BCM2835_SPI0_CS_DMAEN | BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
 
   dmaTx->cbAddr = VIRT_TO_BUS(dmaCb, tx0);
@@ -663,20 +678,21 @@ void SPIDMATransfer(SPITask *task)
 {
   // Transition the SPI peripheral to enable the use of DMA
   spi->cs = BCM2835_SPI0_CS_DMAEN | BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
-  task->dmaSpiHeader = BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS | (task->size << 16); // The first four bytes written to the SPI data register control the DLEN and CS,CPOL,CPHA settings.
+  uint32_t *headerAddr = task->DmaSpiHeaderAddress();
+  *headerAddr = BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS | (task->PayloadSize() << 16); // The first four bytes written to the SPI data register control the DLEN and CS,CPOL,CPHA settings.
 
   // TODO: Ideally we would be able to directly perform the DMA from the SPI ring buffer from 'task' pointer. However
   // that pointer is shared to userland, and it is proving troublesome to make it both userland-writable as well as cache-bypassing DMA coherent.
   // Therefore these two memory areas are separate for now, and we memcpy() from SPI ring buffer to an intermediate 'dmaSourceMemory' memory area to perform
   // the DMA transfer. Is there a way to avoid this intermediate buffer? That would improve performance a bit.
-  memcpy(dmaSourceBuffer.virtualAddr, (void*)&task->dmaSpiHeader, task->size + 4);
+  memcpy(dmaSourceBuffer.virtualAddr, headerAddr, task->PayloadSize() + 4);
 
   volatile DMAControlBlock *cb = (volatile DMAControlBlock *)dmaCb.virtualAddr;
   volatile DMAControlBlock *txcb = &cb[0];
   txcb->ti = BCM2835_DMA_TI_PERMAP(BCM2835_DMA_TI_PERMAP_SPI_TX) | BCM2835_DMA_TI_DEST_DREQ | BCM2835_DMA_TI_SRC_INC | BCM2835_DMA_TI_WAIT_RESP;
   txcb->src = dmaSourceBuffer.busAddress;
   txcb->dst = DMA_SPI_FIFO_PHYS_ADDRESS; // Write out to the SPI peripheral 
-  txcb->len = task->size + 4;
+  txcb->len = task->PayloadSize() + 4;
   txcb->stride = 0;
   txcb->next = 0;
   txcb->debug = 0;
@@ -687,7 +703,7 @@ void SPIDMATransfer(SPITask *task)
   rxcb->ti = BCM2835_DMA_TI_PERMAP(BCM2835_DMA_TI_PERMAP_SPI_RX) | BCM2835_DMA_TI_SRC_DREQ | BCM2835_DMA_TI_DEST_IGNORE;
   rxcb->src = DMA_SPI_FIFO_PHYS_ADDRESS;
   rxcb->dst = 0;
-  rxcb->len = task->size;
+  rxcb->len = task->PayloadSize();
   rxcb->stride = 0;
   rxcb->next = 0;
   rxcb->debug = 0;
@@ -699,7 +715,7 @@ void SPIDMATransfer(SPITask *task)
   dmaRx->cs = BCM2835_DMA_CS_ACTIVE;
   __sync_synchronize();
 
-  double pendingTaskUSecs = task->size * spiUsecsPerByte;
+  double pendingTaskUSecs = task->PayloadSize() * spiUsecsPerByte;
   if (pendingTaskUSecs > 70)
     usleep(pendingTaskUSecs-70);
 
diff --git a/fbcp-ili9341.cpp b/fbcp-ili9341.cpp
index ac539e2..a7d324f 100644
--- a/fbcp-ili9341.cpp
+++ b/fbcp-ili9341.cpp
@@ -463,7 +463,7 @@ int main()
       SPITask *task = AllocTask(i->size*SPI_BYTESPERPIXEL);
       task->cmd = DISPLAY_WRITE_PIXELS;
 
-      bytesTransferred += task->size+1;
+      bytesTransferred += task->PayloadSize()+1;
       uint16_t *scanline = framebuffer[0] + i->y * (gpuFramebufferScanlineStrideBytes>>1);
       uint16_t *prevScanline = framebuffer[1] + i->y * (gpuFramebufferScanlineStrideBytes>>1);
 
diff --git a/spi.cpp b/spi.cpp
index d6dda2b..368ce77 100644
--- a/spi.cpp
+++ b/spi.cpp
@@ -19,6 +19,11 @@
 #include "mailbox.h"
 #include "mem_alloc.h"
 
+#if defined(DISPLAY_SPI_BUS_IS_16BITS_WIDE) && defined(SPI_3WIRE)
+// I do not have any 3-wire ILI9486 displays to test support for this
+#error TODO: 3-wire displays that have 16-bits wide command word are not currently implemented! (that is, ILI9486 does not work in 3-wire configuration) (Only 3-wire displays with 8-bit wide command words are currently supported)
+#endif
+
 int mem_fd = -1;
 volatile void *bcm2835 = 0;
 volatile GPIORegisterFile *gpio = 0;
@@ -82,6 +87,131 @@ void SetRealtimeThreadPriority()
 bool previousTaskWasSPI = true;
 #endif
 
+#ifdef SPI_3WIRE
+
+uint32_t NumBytesNeededFor9BitSPITask(uint32_t byteSizeFor8BitTask)
+{
+  // Each transmitted word is 9 bits wide: 1 data/command flag bit followed by 8 payload bits. The "+ 1" accounts for the command byte.
+  const uint32_t unpaddedBits = (byteSizeFor8BitTask + 1) * 9;
+  // The BCM2835 SPI peripheral only sends out full bytes, so the stream length has to be a multiple of both 9 bits and 8 bits,
+  // i.e. a multiple of lcm(9, 8) = 72 bits. Round the bit count up to the next such multiple.
+  const uint32_t paddedBits = ((unpaddedBits + 71) / 72) * 72;
+  // 72 is divisible by 8, so converting the padded bit count to bytes is exact.
+  return paddedBits / 8;
+}
+
+// Re-packs the 8-bit task (command byte + data bytes) into the 3-wire "9-bit" stream stored right after it: every byte is preceded by one flag bit
+// (0 = command, 1 = data). N.B. BCM2835 hardware always clocks bytes out most significant bit (MSB) first, so the flag bit goes to the highest free bit.
+void Interleave8BitSPITaskTo9Bit(SPITask *task)
+{
+  const uint32_t size8BitTask = task->size - task->size9BitTaskWithPadding;
+
+  // 9-bit SPI task lives right at the end of the 8-bit task
+  uint8_t *dst = task->data + size8BitTask;
+
+  // Pre-clear the 9*8=72 bit tail end of the memory to all zeroes to avoid having to pad source data to multiples of 9. (plus padding bytes, just to be safe)
+  memset(dst + task->size9BitTaskWithPadding - 9 - SPI_9BIT_TASK_PADDING_BYTES, 0, 9 + SPI_9BIT_TASK_PADDING_BYTES);
+
+  // Fill first command byte xxxxxxxx -> 0xxxxxxx x: (leading 0 flag bit to indicate a command byte, the lowest command bit spills to the next byte)
+  dst[0] = task->cmd >> 1;
+  dst[1] = task->cmd << 7;
+  int dstByte = 1;     // Index of the output byte that is currently being filled
+  int dstBitsUsed = 1; // Number of high bits already occupied in dst[dstByte]
+
+  uint32_t src = 0;    // Index of the next 8-bit data byte to convert (unsigned to match size8BitTask in the comparisons below)
+
+  // The command word above produced 9 bits. If there are at least 7 bytes in the data set, we can complete a full group of 8 transferred
+  // 9-bit words (= 72 bits = 9 output bytes). Fast track that:
+  if (size8BitTask >= 7)
+  {
+    dst[1] |= 0x40 |                        (task->data[0] >> 2);
+    dst[2]  = 0x20 | (task->data[0] << 6) | (task->data[1] >> 3);
+    dst[3]  = 0x10 | (task->data[1] << 5) | (task->data[2] >> 4);
+    dst[4]  = 0x08 | (task->data[2] << 4) | (task->data[3] >> 5);
+    dst[5]  = 0x04 | (task->data[3] << 3) | (task->data[4] >> 6);
+    dst[6]  = 0x02 | (task->data[4] << 2) | (task->data[5] >> 7);
+    dst[7]  = 0x01 | (task->data[5] << 1);
+    dst[8]  =        (task->data[6]     );
+    dstByte = 9;
+    dstBitsUsed = 0;
+    src = 7;
+
+    // More fast tracking: while at least 8 source bytes are left, fast fill them in. (Tested as src + 8 <= size so the unsigned size - 8 cannot wrap when size is 7)
+    while(src + 8 <= size8BitTask)
+    {
+      uint8_t *d = dst + dstByte;
+      dstByte += 9;
+      const uint8_t *s = task->data + src;
+      src += 8;
+
+      d[0] = 0x80 |               (s[0] >> 1);
+      d[1] = 0x40 | (s[0] << 7) | (s[1] >> 2);
+      d[2] = 0x20 | (s[1] << 6) | (s[2] >> 3);
+      d[3] = 0x10 | (s[2] << 5) | (s[3] >> 4);
+      d[4] = 0x08 | (s[3] << 4) | (s[4] >> 5);
+      d[5] = 0x04 | (s[4] << 3) | (s[5] >> 6);
+      d[6] = 0x02 | (s[5] << 2) | (s[6] >> 7);
+      d[7] = 0x01 | (s[6] << 1);
+      d[8] = (s[7]     );
+    }
+
+    // Pre-clear the next byte to be written - the slow loop below assumes it is continuing a middle of byte sequence
+    // N.B. This write could happen to memory that is not part of the task, so memory allocation of the 9-bit task needs to allocate one byte of padding
+    dst[dstByte] = 0;
+  }
+
+  // Fill tail data bytes, slow path (also handles whole tasks that have fewer than 7 data bytes)
+  while(src < size8BitTask)
+  {
+    uint8_t data = task->data[src++];
+
+    // High 1 bit to indicate a data byte
+    dst[dstByte] |= 1 << (7 - dstBitsUsed);
+    ++dstBitsUsed;
+    if (dstBitsUsed == 8) // Written data bit completes a full byte?
+    {
+      ++dstByte; // Advance to next byte
+      dstBitsUsed = 0;
+      // Now we are aligned, so can write the data byte directly
+      dst[dstByte++] = data;
+      dst[dstByte] = 0; // Clear old contents of the next byte to write
+    }
+    else
+    {
+      // 8 data bits: the high part fills the rest of the current byte, the low part starts the next byte
+      dst[dstByte++] |= data >> dstBitsUsed;
+      // This is the first write to the next byte, that should occur without ORring to clear old data in memory
+      // N.B. This write could happen to memory that is not part of the task, so memory allocation of the 9-bit task needs to allocate one byte of padding
+      dst[dstByte] = data << (8 - dstBitsUsed);
+    }
+  }
+
+#if 0 // Enable to debug correctness:
+
+#define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c"
+#define BYTE_TO_BINARY(byte)  \
+  (byte & 0x80 ? '1' : '0'), \
+  (byte & 0x40 ? '1' : '0'), \
+  (byte & 0x20 ? '1' : '0'), \
+  (byte & 0x10 ? '1' : '0'), \
+  (byte & 0x08 ? '1' : '0'), \
+  (byte & 0x04 ? '1' : '0'), \
+  (byte & 0x02 ? '1' : '0'), \
+  (byte & 0x01 ? '1' : '0')
+
+  printf("Interleaving result: 8-bit task of size %d bytes became %d bytes:\n", task->size - task->size9BitTaskWithPadding, task->size9BitTaskWithPadding - SPI_9BIT_TASK_PADDING_BYTES);
+  printf("8-bit c" BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(task->cmd));
+  for(int i = 0; i < task->size - task->size9BitTaskWithPadding; ++i)
+    printf("d" BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(task->data[i]));
+  printf("\n9-bit ");
+  for(int i = 0; i < task->size9BitTaskWithPadding - SPI_9BIT_TASK_PADDING_BYTES; ++i)
+    printf(BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(dst[i]));
+  printf("\n\n");
+#endif
+
+}
+#endif // ~SPI_3WIRE
+
 void WaitForPolledSPITransferToFinish()
 {
   uint32_t cs;
@@ -98,16 +228,18 @@ void WaitForPolledSPITransferToFinish()
 void RunSPITask(SPITask *task)
 {
   uint32_t cs;
-  uint8_t *tStart = task->data;
-  uint8_t *tEnd = task->data + task->size;
+  uint8_t *tStart = task->PayloadStart();
+  uint8_t *tEnd = task->PayloadEnd();
+  const uint32_t payloadSize = tEnd - tStart;
+  uint8_t *tPrefillEnd = tStart + MIN(15, payloadSize);
 
 #define TASK_SIZE_TO_USE_DMA 4
   // Do a DMA transfer if this task is suitable in size for DMA to handle
-  if (task->size >= TASK_SIZE_TO_USE_DMA && (task->cmd == DISPLAY_WRITE_PIXELS || task->cmd == DISPLAY_SET_CURSOR_X || task->cmd == DISPLAY_SET_CURSOR_Y))
+  if (payloadSize >= TASK_SIZE_TO_USE_DMA && (task->cmd == DISPLAY_WRITE_PIXELS || task->cmd == DISPLAY_SET_CURSOR_X || task->cmd == DISPLAY_SET_CURSOR_Y))
   {
     if (previousTaskWasSPI)
       WaitForPolledSPITransferToFinish();
-//    printf("DMA cmd=0x%x, data=%d bytes\n", task->cmd, task->size);
+//    printf("DMA cmd=0x%x, data=%d bytes\n", task->cmd, task->PayloadSize());
     SPIDMATransfer(task);
     previousTaskWasSPI = false;
   }
@@ -123,7 +255,10 @@ void RunSPITask(SPITask *task)
     else
       WaitForPolledSPITransferToFinish();
 
-//    printf("SPI cmd=0x%x, data=%d bytes\n", task->cmd, task->size);
+//    printf("SPI cmd=0x%x, data=%d bytes\n", task->cmd, task->PayloadSize());
+
+  // Send the command word if display is 4-wire (3-wire displays can omit this, commands are interleaved in the data payload stream above)
+#ifdef SPI_4WIRE
     CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);
 
 #ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
@@ -132,7 +267,6 @@ void RunSPITask(SPITask *task)
 #endif
     spi->fifo = task->cmd;
 
-    uint8_t *tPrefillEnd = task->data + MIN(15, task->size);
 #ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
     while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
     spi->fifo;
@@ -142,7 +276,9 @@ void RunSPITask(SPITask *task)
 #endif
 
     SET_GPIO(GPIO_TFT_DATA_CONTROL);
+#endif
 
+    // Send the data payload:
     while(tStart < tPrefillEnd) spi->fifo = *tStart++;
     while(tStart < tEnd)
     {
@@ -156,6 +292,7 @@ void RunSPITask(SPITask *task)
   }
 }
 #else
+
 void RunSPITask(SPITask *task)
 {
   WaitForPolledSPITransferToFinish();
@@ -168,6 +305,13 @@ void RunSPITask(SPITask *task)
   BEGIN_SPI_COMMUNICATION();
 #endif
 
+  uint8_t *tStart = task->PayloadStart();
+  uint8_t *tEnd = task->PayloadEnd();
+  const uint32_t payloadSize = tEnd - tStart;
+  uint8_t *tPrefillEnd = tStart + MIN(15, payloadSize);
+
+  // Send the command word if display is 4-wire (3-wire displays can omit this, commands are interleaved in the data payload stream above)
+#ifdef SPI_4WIRE
   // An SPI transfer to the display always starts with one control (command) byte, followed by N data bytes.
   CLEAR_GPIO(GPIO_TFT_DATA_CONTROL);
 
@@ -177,9 +321,6 @@ void RunSPITask(SPITask *task)
 #endif
   spi->fifo = task->cmd;
 
-  uint8_t *tStart = task->data;
-  uint8_t *tEnd = task->data + task->size;
-  uint8_t *tPrefillEnd = task->data + MIN(15, task->size);
 #ifdef DISPLAY_SPI_BUS_IS_16BITS_WIDE
   while(!(spi->cs & (BCM2835_SPI0_CS_DONE))) /*nop*/;
   spi->fifo;
@@ -189,6 +330,7 @@ void RunSPITask(SPITask *task)
 #endif
 
   SET_GPIO(GPIO_TFT_DATA_CONTROL);
+#endif // ~SPI_4WIRE
 
 // For small transfers, using DMA is not worth it, but pushing through with polled SPI gives better bandwidth.
 // For larger transfers though that are more than this amount of bytes, using DMA is faster.
@@ -222,7 +364,6 @@ void RunSPITask(SPITask *task)
 }
 #endif
 
-
 SharedMemory *spiTaskMemory = 0;
 volatile uint64_t spiThreadIdleUsecs = 0;
 volatile uint64_t spiThreadSleepStartTime = 0;
@@ -247,7 +388,7 @@ SPITask *GetTask() // Returns the first task in the queue, called in worker thre
 
 void DoneTask(SPITask *task) // Frees the first SPI task from the queue, called in worker thread
 {
-  __atomic_fetch_sub(&spiTaskMemory->spiBytesQueued, task->size+1, __ATOMIC_RELAXED);
+  __atomic_fetch_sub(&spiTaskMemory->spiBytesQueued, task->PayloadSize()+1, __ATOMIC_RELAXED);
   spiTaskMemory->queueHead = (uint32_t)((uint8_t*)task - spiTaskMemory->buffer) + sizeof(SPITask) + task->size;
   __sync_synchronize();
 }
@@ -345,7 +486,9 @@ int InitSPI()
 
 #if !defined(KERNEL_MODULE_CLIENT) || defined(KERNEL_MODULE_CLIENT_DRIVES)
   // By default all GPIO pins are in input mode (0x00), initialize them for SPI and GPIO writes
+#ifdef GPIO_TFT_DATA_CONTROL
   SET_GPIO_MODE(GPIO_TFT_DATA_CONTROL, 0x01); // Data/Control pin to output (0x01)
+#endif
   SET_GPIO_MODE(GPIO_SPI0_MISO, 0x04);
   SET_GPIO_MODE(GPIO_SPI0_MOSI, 0x04);
   SET_GPIO_MODE(GPIO_SPI0_CLK, 0x04);
@@ -441,7 +584,9 @@ void DeinitSPI()
   spi->cs = BCM2835_SPI0_CS_CLEAR | DISPLAY_SPI_DRIVE_SETTINGS;
 
 #ifndef KERNEL_MODULE_CLIENT
+#ifdef GPIO_TFT_DATA_CONTROL
   SET_GPIO_MODE(GPIO_TFT_DATA_CONTROL, 0);
+#endif
   SET_GPIO_MODE(GPIO_SPI0_CE1, 0);
   SET_GPIO_MODE(GPIO_SPI0_CE0, 0);
   SET_GPIO_MODE(GPIO_SPI0_MISO, 0);
diff --git a/spi.h b/spi.h
index 98b9b37..f473d5f 100644
--- a/spi.h
+++ b/spi.h
@@ -87,6 +87,9 @@ extern volatile SPIRegisterFile *spi;
 #define SHARED_MEMORY_SIZE (DISPLAY_DRAWABLE_WIDTH*DISPLAY_DRAWABLE_HEIGHT*SPI_BYTESPERPIXEL*3)
 #define SPI_QUEUE_SIZE (SHARED_MEMORY_SIZE - sizeof(SharedMemory))
 
+// Need a bit of padding for 8-bit -> 9-bit expansion for performance
+#define SPI_9BIT_TASK_PADDING_BYTES 1
+
 // Defines the maximum size of a single SPI task, in bytes. This excludes the command byte. If MAX_SPI_TASK_SIZE
 // is not defined, there is no length limit that applies. (In ALL_TASKS_SHOULD_DMA version of DMA transfer,
 // there is DMA chaining, so SPI tasks can be arbitrarily long)
@@ -96,7 +99,10 @@ extern volatile SPIRegisterFile *spi;
 
 typedef struct __attribute__((packed)) SPITask
 {
-  uint32_t size;
+  uint32_t size; // Size, including both 8-bit and 9-bit tasks
+#ifdef SPI_3WIRE
+  uint32_t size9BitTaskWithPadding; // Size of the 9-bit task. The 9-bit task starts at address spiTask->data + spiTask->size - spiTask->size9BitTaskWithPadding;
+#endif
   uint8_t cmd;
   uint32_t dmaSpiHeader;
 #ifdef OFFLOAD_PIXEL_COPY_TO_DMA_CPP
@@ -104,7 +110,20 @@ typedef struct __attribute__((packed)) SPITask
   uint8_t *prevFb;
   uint16_t width;
 #endif
-  uint8_t data[];
+  uint8_t data[]; // Contains both 8-bit and 9-bit tasks back to back, 8-bit first, then 9-bit.
+
+#ifdef SPI_4WIRE // 4-wire: the payload sent on the wire is the 8-bit data itself
+  inline uint8_t *PayloadStart() { return data; } // First payload byte
+  inline uint8_t *PayloadEnd() { return data + size; } // One past the last payload byte
+  inline uint32_t PayloadSize() const { return size; } // Payload length in bytes
+  inline uint32_t *DmaSpiHeaderAddress() { return &dmaSpiHeader; } // The dedicated header field of this struct
+#else // NOTE(review): also taken when neither SPI_4WIRE nor SPI_3WIRE is defined, but size9BitTaskWithPadding only exists under SPI_3WIRE - confirm
+  inline uint8_t *PayloadStart() { return data + (size - size9BitTaskWithPadding); } // The 9-bit task starts right after the 8-bit task
+  inline uint8_t *PayloadEnd() { return data + (size - SPI_9BIT_TASK_PADDING_BYTES); } // One past the last payload byte, padding excluded
+  inline uint32_t PayloadSize() const { return size9BitTaskWithPadding - SPI_9BIT_TASK_PADDING_BYTES; } // Byte size of the 9-bit task without padding
+  inline uint32_t *DmaSpiHeaderAddress() { return (uint32_t*)(PayloadStart()-4); } // The 4 bytes directly in front of the 9-bit payload
+#endif
+
 } SPITask;
 
 #define BEGIN_SPI_COMMUNICATION() do { spi->cs = BCM2835_SPI0_CS_TA | DISPLAY_SPI_DRIVE_SETTINGS; } while(0)
@@ -231,9 +250,27 @@ extern volatile int spiThreadSleeping;
 
 extern int mem_fd;
 
+#ifdef SPI_3WIRE
+
+// Converts the given SPI task in-place from an 8-bit task to a 9-bit task.
+void Interleave8BitSPITaskTo9Bit(SPITask *task);
+
+// If the given display is a 3-wire SPI display (9 bits/task instead of 8 bits/task), this function computes the byte size of the 8-bit task when it is converted to a 9-bit task.
+uint32_t NumBytesNeededFor9BitSPITask(uint32_t byteSizeFor8BitTask);
+
+#endif
+
 static inline SPITask *AllocTask(uint32_t bytes) // Returns a pointer to a new SPI task block, called on main thread
 {
-  uint32_t bytesToAllocate = sizeof(SPITask) + bytes;
+#ifdef SPI_3WIRE
+  // For 3-wire/9-bit tasks, store the converted task right at the end of the 8-bit task.
+  uint32_t size9BitTaskWithPadding = NumBytesNeededFor9BitSPITask(bytes) + SPI_9BIT_TASK_PADDING_BYTES;
+  bytes += size9BitTaskWithPadding;
+#else
+//  const uint32_t totalBytesFor9BitTask = 0;
+#endif
+
+  uint32_t bytesToAllocate = sizeof(SPITask) + bytes;// + totalBytesFor9BitTask;
   uint32_t tail = spiTaskMemory->queueTail;
   uint32_t newTail = tail + bytesToAllocate;
   // Is the new task too large to write contiguously into the ring buffer, that it's split into two parts? We never split,
@@ -280,6 +317,9 @@ static inline SPITask *AllocTask(uint32_t bytes) // Returns a pointer to a new S
 
   SPITask *task = (SPITask*)(spiTaskMemory->buffer + tail);
   task->size = bytes;
+#ifdef SPI_3WIRE
+  task->size9BitTaskWithPadding = size9BitTaskWithPadding;
+#endif
 #ifdef OFFLOAD_PIXEL_COPY_TO_DMA_CPP
   task->fb = &task->data[0];
   task->prevFb = 0;
@@ -289,12 +329,15 @@ static inline SPITask *AllocTask(uint32_t bytes) // Returns a pointer to a new S
 
 static inline void CommitTask(SPITask *task) // Advertises the given SPI task from main thread to worker, called on main thread
 {
+#ifdef SPI_3WIRE
+  Interleave8BitSPITaskTo9Bit(task);
+#endif
   __sync_synchronize();
 #if !defined(KERNEL_MODULE_CLIENT) && !defined(KERNEL_MODULE)
   uint32_t tail = spiTaskMemory->queueTail;
 #endif
   spiTaskMemory->queueTail = (uint32_t)((uint8_t*)task - spiTaskMemory->buffer) + sizeof(SPITask) + task->size;
-  __atomic_fetch_add(&spiTaskMemory->spiBytesQueued, task->size+1, __ATOMIC_RELAXED);
+  __atomic_fetch_add(&spiTaskMemory->spiBytesQueued, task->PayloadSize()+1, __ATOMIC_RELAXED);
   __sync_synchronize();
 #if !defined(KERNEL_MODULE_CLIENT) && !defined(KERNEL_MODULE)
   if (spiTaskMemory->queueHead == tail) syscall(SYS_futex, &spiTaskMemory->queueTail, FUTEX_WAKE, 1, 0, 0, 0); // Wake the SPI thread if it was sleeping to get new tasks