-
Notifications
You must be signed in to change notification settings - Fork 268
/
diff.cpp
490 lines (445 loc) · 16.5 KB
/
diff.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
#include "config.h"
#include "diff.h"
#include "util.h"
#include "display.h"
#include "gpu.h"
#include "spi.h"
Span *spans = 0;
#ifdef UPDATE_FRAMES_WITHOUT_DIFFING
// Naive non-diffing functionality: just submit the whole display contents
void NoDiffChangedRectangle(Span *&head)
{
head = spans;
head->x = 0;
head->endX = head->lastScanEndX = gpuFrameWidth;
head->y = 0;
head->endY = gpuFrameHeight;
head->size = gpuFrameWidth*gpuFrameHeight;
head->next = 0;
}
#endif
#ifdef UPDATE_FRAMES_IN_SINGLE_RECTANGULAR_DIFF
// Coarse diffing of two framebuffers with tight stride, 16 pixels at a time
// Finds the first changed pixel, coarse result aligned down to 8 pixels boundary
static int coarse_linear_diff(uint16_t *framebuffer, uint16_t *prevFramebuffer, uint16_t *framebufferEnd)
{
uint16_t *endPtr;
asm volatile(
"mov r0, %[framebufferEnd]\n" // r0 <- pointer to end of current framebuffer
"mov r1, %[framebuffer]\n" // r1 <- current framebuffer
"mov r2, %[prevFramebuffer]\n" // r2 <- framebuffer of previous frame
"start_%=:\n"
"pld [r1, #128]\n" // preload data caches for both current and previous framebuffers 128 bytes ahead of time
"pld [r2, #128]\n"
"ldmia r1!, {r3,r4,r5,r6}\n" // load 4x32-bit elements (8 pixels) of current framebuffer
"ldmia r2!, {r7,r8,r9,r10}\n" // load corresponding 4x32-bit elements (8 pixels) of previous framebuffer
"cmp r3, r7\n" // compare all 8 pixels if they are different
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
// Unroll once for another set of 4x32-bit elements. On Raspberry Pi Zero, data cache line is 32 bytes in size, so one iteration
// of the loop computes a single data cache line, with preloads in place at the top.
"ldmia r1!, {r3,r4,r5,r6}\n"
"ldmia r2!, {r7,r8,r9,r10}\n"
"cmp r3, r7\n"
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
"cmp r0, r1\n" // framebuffer == framebufferEnd? did we finish through the array?
"bne start_%=\n"
"b done_%=\n"
"end_%=:\n"
"sub r1, r1, #16\n" // ldmia r1! increments r1 after load, so subtract back the last increment in order to not shoot past the first changed pixels
"done_%=:\n"
"mov %[endPtr], r1\n" // output endPtr back to C code
: [endPtr]"=r"(endPtr)
: [framebuffer]"r"(framebuffer), [prevFramebuffer]"r"(prevFramebuffer), [framebufferEnd]"r"(framebufferEnd)
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
);
return endPtr - framebuffer;
}
// Same as coarse_linear_diff, but finds the last changed pixel in linear order instead of first, i.e.
// Finds the last changed pixel, coarse result aligned up to 8 pixels boundary
static int coarse_backwards_linear_diff(uint16_t *framebuffer, uint16_t *prevFramebuffer, uint16_t *framebufferEnd)
{
uint16_t *endPtr;
asm volatile(
"mov r0, %[framebufferBegin]\n" // r0 <- pointer to beginning of current framebuffer
"mov r1, %[framebuffer]\n" // r1 <- current framebuffer (starting from end of framebuffer)
"mov r2, %[prevFramebuffer]\n" // r2 <- framebuffer of previous frame (starting from end of framebuffer)
"start_%=:\n"
"pld [r1, #-128]\n" // preload data caches for both current and previous framebuffers 128 bytes ahead of time
"pld [r2, #-128]\n"
"ldmdb r1!, {r3,r4,r5,r6}\n" // load 4x32-bit elements (8 pixels) of current framebuffer
"ldmdb r2!, {r7,r8,r9,r10}\n" // load corresponding 4x32-bit elements (8 pixels) of previous framebuffer
"cmp r3, r7\n" // compare all 8 pixels if they are different
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
// Unroll once for another set of 4x32-bit elements. On Raspberry Pi Zero, data cache line is 32 bytes in size, so one iteration
// of the loop computes a single data cache line, with preloads in place at the top.
"ldmdb r1!, {r3,r4,r5,r6}\n"
"ldmdb r2!, {r7,r8,r9,r10}\n"
"cmp r3, r7\n"
"cmpeq r4, r8\n"
"cmpeq r5, r9\n"
"cmpeq r6, r10\n"
"bne end_%=\n" // if we found a difference, we are done
"cmp r0, r1\n" // framebuffer == framebufferEnd? did we finish through the array?
"bne start_%=\n"
"b done_%=\n"
"end_%=:\n"
"add r1, r1, #16\n" // ldmdb r1! decrements r1 before load, so add back the last decrement in order to not shoot past the first changed pixels
"done_%=:\n"
"mov %[endPtr], r1\n" // output endPtr back to C code
: [endPtr]"=r"(endPtr)
: [framebuffer]"r"(framebufferEnd), [prevFramebuffer]"r"(prevFramebuffer+(framebufferEnd-framebuffer)), [framebufferBegin]"r"(framebuffer)
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
);
return endPtr - framebuffer;
}
void DiffFramebuffersToSingleChangedRectangle(uint16_t *framebuffer, uint16_t *prevFramebuffer, Span *&head)
{
int minY = 0;
int minX = -1;
const int stride = gpuFramebufferScanlineStrideBytes>>1; // Stride as uint16 elements.
const int WidthAligned4 = (uint32_t)gpuFrameWidth & ~3u;
uint16_t *scanline = framebuffer;
uint16_t *prevScanline = prevFramebuffer;
static const bool framebufferSizeCompatibleWithCoarseDiff = gpuFramebufferScanlineStrideBytes == gpuFrameWidth*2 && gpuFramebufferScanlineStrideBytes*gpuFrameHeight % 32 == 0;
if (framebufferSizeCompatibleWithCoarseDiff)
{
int numPixels = gpuFrameWidth*gpuFrameHeight;
int firstDiff = coarse_linear_diff(framebuffer, prevFramebuffer, framebuffer + numPixels);
if (firstDiff == numPixels)
return; // No pixels changed, nothing to do.
// Coarse diff computes a diff at 8 adjacent pixels at a time, and returns the point to the 8-pixel aligned coordinate where the pixels began to differ.
// Compute the precise diff position here.
while(framebuffer[firstDiff] == prevFramebuffer[firstDiff]) ++firstDiff;
minX = firstDiff % gpuFrameWidth;
minY = firstDiff / gpuFrameWidth;
}
else
{
while(minY < gpuFrameHeight)
{
int x = 0;
// diff 4 pixels at a time
for(; x < WidthAligned4; x += 4)
{
uint64_t diff = *(uint64_t*)(scanline+x) ^ *(uint64_t*)(prevScanline+x);
if (diff)
{
minX = x + (__builtin_ctzll(diff) >> 4);
goto found_top;
}
}
// tail unaligned 0-3 pixels one by one
for(; x < gpuFrameWidth; ++x)
{
uint16_t diff = *(scanline+x) ^ *(prevScanline+x);
if (diff)
{
minX = x;
goto found_top;
}
}
scanline += stride;
prevScanline += stride;
++minY;
}
return; // No pixels changed, nothing to do.
}
found_top:
int maxX = -1;
int maxY = gpuFrameHeight-1;
if (framebufferSizeCompatibleWithCoarseDiff)
{
int numPixels = gpuFrameWidth*gpuFrameHeight;
int firstDiff = coarse_backwards_linear_diff(framebuffer, prevFramebuffer, framebuffer + numPixels);
// Coarse diff computes a diff at 8 adjacent pixels at a time, and returns the point to the 8-pixel aligned coordinate where the pixels began to differ.
// Compute the precise diff position here.
while(firstDiff > 0 && framebuffer[firstDiff] == prevFramebuffer[firstDiff]) --firstDiff;
maxX = firstDiff % gpuFrameWidth;
maxY = firstDiff / gpuFrameWidth;
}
else
{
scanline = framebuffer + (gpuFrameHeight - 1)*stride;
prevScanline = prevFramebuffer + (gpuFrameHeight - 1)*stride; // (same scanline from previous frame, not preceding scanline)
while(maxY >= minY)
{
int x = gpuFrameWidth-1;
// tail unaligned 0-3 pixels one by one
for(; x >= WidthAligned4; --x)
{
if (scanline[x] != prevScanline[x])
{
maxX = x;
goto found_bottom;
}
}
// diff 4 pixels at a time
x = x & ~3u;
for(; x >= 0; x -= 4)
{
uint64_t diff = *(uint64_t*)(scanline+x) ^ *(uint64_t*)(prevScanline+x);
if (diff)
{
maxX = x + 3 - (__builtin_clzll(diff) >> 4);
goto found_bottom;
}
}
scanline -= stride;
prevScanline -= stride;
--maxY;
}
}
found_bottom:
scanline = framebuffer + minY*stride;
prevScanline = prevFramebuffer + minY*stride;
int lastScanEndX = maxX;
if (minX > maxX) SWAPU32(minX, maxX);
int leftX = 0;
while(leftX < minX)
{
uint16_t *s = scanline + leftX;
uint16_t *prevS = prevScanline + leftX;
for(int y = minY; y <= maxY; ++y)
{
if (*s != *prevS)
goto found_left;
s += stride;
prevS += stride;
}
++leftX;
}
found_left:
int rightX = gpuFrameWidth-1;
while(rightX > maxX)
{
uint16_t *s = scanline + rightX;
uint16_t *prevS = prevScanline + rightX;
for(int y = minY; y <= maxY; ++y)
{
if (*s != *prevS)
goto found_right;
s += stride;
prevS += stride;
}
--rightX;
}
found_right:
head = spans;
head->x = leftX;
head->endX = rightX+1;
head->lastScanEndX = lastScanEndX+1;
head->y = minY;
head->endY = maxY+1;
#if defined(ALIGN_DIFF_TASKS_FOR_32B_CACHE_LINES) && defined(ALL_TASKS_SHOULD_DMA)
// Make sure the task is a multiple of 32 bytes wide so we can use a fast DMA copy
// algorithm later on. Currently this is only exploited in dma.cpp if ALL_TASKS_SHOULD_DMA
// option is enabled, so only enable it there.
head->x = MAX(0, ALIGN_DOWN(head->x, 16));
head->endX = MIN(gpuFrameWidth, ALIGN_UP(head->endX, 16));
head->lastScanEndX = ALIGN_UP(head->lastScanEndX, 16);
#endif
head->size = (head->endX-head->x)*(head->endY-head->y-1) + (head->lastScanEndX - head->x);
head->next = 0;
}
#endif
void DiffFramebuffersToScanlineSpansFastAndCoarse4Wide(uint16_t *framebuffer, uint16_t *prevFramebuffer, bool interlacedDiff, int interlacedFieldParity, Span *&head)
{
int numSpans = 0;
int y = interlacedDiff ? interlacedFieldParity : 0;
int yInc = interlacedDiff ? 2 : 1;
// If doing an interlaced update, skip over every second scanline.
int scanlineInc = interlacedDiff ? (gpuFramebufferScanlineStrideBytes>>2) : (gpuFramebufferScanlineStrideBytes>>3);
uint64_t *scanline = (uint64_t *)(framebuffer + y*(gpuFramebufferScanlineStrideBytes>>1));
uint64_t *prevScanline = (uint64_t *)(prevFramebuffer + y*(gpuFramebufferScanlineStrideBytes>>1)); // (same scanline from previous frame, not preceding scanline)
const int W = gpuFrameWidth>>2;
Span *span = spans;
while(y < gpuFrameHeight)
{
uint16_t *scanlineStart = (uint16_t *)scanline;
for(int x = 0; x < W;)
{
if (scanline[x] != prevScanline[x])
{
uint16_t *spanStart = (uint16_t *)(scanline + x) + (__builtin_ctzll(scanline[x] ^ prevScanline[x]) >> 4);
++x;
// We've found a start of a span of different pixels on this scanline, now find where this span ends
uint16_t *spanEnd;
for(;;)
{
if (x < W)
{
if (scanline[x] != prevScanline[x])
{
++x;
continue;
}
else
{
spanEnd = (uint16_t *)(scanline + x) + 1 - (__builtin_clzll(scanline[x-1] ^ prevScanline[x-1]) >> 4);
++x;
break;
}
}
else
{
spanEnd = scanlineStart + gpuFrameWidth;
break;
}
}
// Submit the span update task
span->x = spanStart - scanlineStart;
span->endX = span->lastScanEndX = spanEnd - scanlineStart;
span->y = y;
span->endY = y+1;
span->size = spanEnd - spanStart;
span->next = span+1;
++span;
++numSpans;
}
else
{
++x;
}
}
y += yInc;
scanline += scanlineInc;
prevScanline += scanlineInc;
}
if (numSpans > 0)
{
head = &spans[0];
spans[numSpans-1].next = 0;
}
else
head = 0;
}
void DiffFramebuffersToScanlineSpansExact(uint16_t *framebuffer, uint16_t *prevFramebuffer, bool interlacedDiff, int interlacedFieldParity, Span *&head)
{
int numSpans = 0;
int y = interlacedDiff ? interlacedFieldParity : 0;
int yInc = interlacedDiff ? 2 : 1;
// If doing an interlaced update, skip over every second scanline.
int scanlineInc = interlacedDiff ? gpuFramebufferScanlineStrideBytes : (gpuFramebufferScanlineStrideBytes>>1);
int scanlineEndInc = scanlineInc - gpuFrameWidth;
uint16_t *scanline = framebuffer + y*(gpuFramebufferScanlineStrideBytes>>1);
uint16_t *prevScanline = prevFramebuffer + y*(gpuFramebufferScanlineStrideBytes>>1); // (same scanline from previous frame, not preceding scanline)
while(y < gpuFrameHeight)
{
uint16_t *scanlineStart = scanline;
uint16_t *scanlineEnd = scanline + gpuFrameWidth;
while(scanline < scanlineEnd)
{
uint16_t *spanStart;
uint16_t *spanEnd;
int numConsecutiveUnchangedPixels = 0;
if (scanline + 1 < scanlineEnd)
{
uint32_t diff = (*(uint32_t *)scanline) ^ (*(uint32_t *)prevScanline);
scanline += 2;
prevScanline += 2;
if (diff == 0) // Both 1st and 2nd pixels are the same
continue;
if (diff & 0xFFFF == 0) // 1st pixels are the same, 2nd pixels are not
{
spanStart = scanline - 1;
spanEnd = scanline;
}
else // 1st pixels are different
{
spanStart = scanline - 2;
if ((diff & 0xFFFF0000u) != 0) // 2nd pixels are different?
{
spanEnd = scanline;
}
else
{
spanEnd = scanline - 1;
numConsecutiveUnchangedPixels = 1;
}
}
// We've found a start of a span of different pixels on this scanline, now find where this span ends
while(scanline < scanlineEnd)
{
if (*scanline++ != *prevScanline++)
{
spanEnd = scanline;
numConsecutiveUnchangedPixels = 0;
}
else
{
if (++numConsecutiveUnchangedPixels > SPAN_MERGE_THRESHOLD)
break;
}
}
}
else // handle the single last pixel on the row
{
if (*scanline++ == *prevScanline++)
break;
spanStart = scanline - 1;
spanEnd = scanline;
}
// Submit the span update task
Span *span = spans + numSpans;
span->x = spanStart - scanlineStart;
span->endX = span->lastScanEndX = spanEnd - scanlineStart;
span->y = y;
span->endY = y+1;
span->size = spanEnd - spanStart;
if (numSpans > 0) span[-1].next = span;
else head = span;
span->next = 0;
++numSpans;
}
y += yInc;
scanline += scanlineEndInc;
prevScanline += scanlineEndInc;
}
}
void MergeScanlineSpanList(Span *listHead)
{
for(Span *i = listHead; i; i = i->next)
{
Span *prev = i;
for(Span *j = i->next; j; j = j->next)
{
// If the spans i and j are vertically apart, don't attempt to merge span i any further, since all spans >= j will also be farther vertically apart.
// (the list is nondecreasing with respect to Span::y)
if (j->y > i->endY) break;
// Merge the spans i and j, and figure out the wastage of doing so
int x = MIN(i->x, j->x);
int y = MIN(i->y, j->y);
int endX = MAX(i->endX, j->endX);
int endY = MAX(i->endY, j->endY);
int lastScanEndX = (endY > i->endY) ? j->lastScanEndX : ((endY > j->endY) ? i->lastScanEndX : MAX(i->lastScanEndX, j->lastScanEndX));
int newSize = (endX-x)*(endY-y-1) + (lastScanEndX - x);
int wastedPixels = newSize - i->size - j->size;
if (wastedPixels <= SPAN_MERGE_THRESHOLD
#ifdef MAX_SPI_TASK_SIZE
&& newSize*SPI_BYTESPERPIXEL <= MAX_SPI_TASK_SIZE
#endif
)
{
i->x = x;
i->y = y;
i->endX = endX;
i->endY = endY;
i->lastScanEndX = lastScanEndX;
i->size = newSize;
prev->next = j->next;
j = prev;
}
else // Not merging - travel to next node remembering where we came from
prev = j;
}
}
}