Skip to content

Commit

Permalink
feat(video): reduce branching for faster video render task (#89)
Browse files Browse the repository at this point in the history
* feat(video): reduce branching for faster video render task

* simplify frame receive in video task
  • Loading branch information
finger563 authored Oct 15, 2024
1 parent 3f72f4c commit 2f2e219
Showing 1 changed file with 51 additions and 39 deletions.
90 changes: 51 additions & 39 deletions components/box-emu/src/box-emu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -487,12 +487,12 @@ void BoxEmu::palette(const uint16_t *palette, size_t size) {
palette_size_ = size;
}

void BoxEmu::push_frame(const void* frame) {
void IRAM_ATTR BoxEmu::push_frame(const void* frame) {
if (video_queue_ == nullptr) {
logger_.error("video queue is null, make sure to call initialize_video() first!");
return;
}
xQueueSend(video_queue_, &frame, 10 / portTICK_PERIOD_MS);
xQueueSend(video_queue_, &frame, 5 / portTICK_PERIOD_MS);
}

VideoSetting BoxEmu::video_setting() const {
Expand Down Expand Up @@ -717,28 +717,23 @@ const uint16_t* BoxEmu::palette() const {

bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
const void *_frame_ptr;
if (xQueuePeek(video_queue_, &_frame_ptr, 100 / portTICK_PERIOD_MS) != pdTRUE) {
// we couldn't get anything from the queue, return
return false;
}
if (_frame_ptr == nullptr) {
// make sure we clear the queue
xQueueReceive(video_queue_, &_frame_ptr, 10 / portTICK_PERIOD_MS);
// we got a nullptr, return
if (xQueueReceive(video_queue_, &_frame_ptr, portMAX_DELAY) != pdTRUE) {
return false;
}
static constexpr int num_lines_to_write = num_rows_in_framebuffer;
auto &box = espp::EspBox::get();
static int vram_index = 0; // has to be static so that it persists between calls
static uint16_t vram_index = 0; // has to be static so that it persists between calls
const int _x_offset = x_offset();
const int _y_offset = y_offset();
const uint16_t* _palette = palette();
uint16_t *vram0 = (uint16_t*)box.vram0();
uint16_t *vram1 = (uint16_t*)box.vram1();
if (is_native()) {
for (int y=0; y<display_height_; y+= num_lines_to_write) {
uint16_t* _buf = vram_index ? (uint16_t*)box.vram1() : (uint16_t*)box.vram0();
vram_index = vram_index ? 0 : 1;
int num_lines = std::min<int>(num_lines_to_write, display_height_-y);
if (has_palette()) {
if (has_palette()) {
for (int y=0; y<display_height_; y+= num_lines_to_write) {
uint16_t* _buf = (uint16_t*)((uint32_t)vram0 * (vram_index ^ 0x01) + (uint32_t)vram1 * vram_index);
vram_index = vram_index ^ 0x01;
int num_lines = std::min<int>(num_lines_to_write, display_height_-y);
const uint8_t* _frame = (const uint8_t*)_frame_ptr;
for (int i=0; i<num_lines; i++) {
// write two pixels (32 bits) at a time because it's faster
Expand All @@ -749,7 +744,14 @@ bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
_buf[dst_index + 1] = _palette[_frame[src_index + 1] % palette_size_];
}
}
} else {
box.write_lcd_frame(_x_offset, y + _y_offset, display_width_, num_lines, (uint8_t*)&_buf[0]);
}
} else {
// no palette
for (int y=0; y<display_height_; y+= num_lines_to_write) {
uint16_t* _buf = (uint16_t*)((uint32_t)vram0 * (vram_index ^ 0x01) + (uint32_t)vram1 * vram_index);
vram_index = vram_index ^ 0x01;
int num_lines = std::min<int>(num_lines_to_write, display_height_-y);
const uint16_t* _frame = (const uint16_t*)_frame_ptr;
for (int i=0; i<num_lines; i++) {
// write two pixels (32 bits) at a time because it's faster
Expand All @@ -761,8 +763,8 @@ bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
_buf[dst_index + 1] = _frame[src_index + 1];
}
}
box.write_lcd_frame(_x_offset, y + _y_offset, display_width_, num_lines, (uint8_t*)&_buf[0]);
}
box.write_lcd_frame(_x_offset, y + _y_offset, display_width_, num_lines, (uint8_t*)&_buf[0]);
}
} else {
// we are scaling the screen (and possibly using a custom palette)
Expand All @@ -773,21 +775,19 @@ bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
float inv_y_scale = (float)native_height_/display_height_;
int max_y = espp::EspBox::lcd_height();
int max_x = std::clamp<int>(x_scale * native_width_, 0, espp::EspBox::lcd_width());
for (int y=0; y<max_y; y+=num_lines_to_write) {
// each iteration of the loop, we swap the vram index so that we can
// write to the other buffer while the other one is being transmitted
int i = 0;
uint16_t* _buf = vram_index ? (uint16_t*)box.vram1() : (uint16_t*)box.vram0();
vram_index = vram_index ? 0 : 1;
for (; i<num_lines_to_write; i++) {
int _y = y+i;
if (_y >= max_y) {
break;
}
int source_y = (float)_y * inv_y_scale;
// should I put this around the outer loop, or is this loop a good
// balance for performance of the check?
if (has_palette()) {
if (has_palette()) {
for (int y=0; y<max_y; y+=num_lines_to_write) {
// each iteration of the loop, we swap the vram index so that we can
// write to the other buffer while the other one is being transmitted
int i = 0;
uint16_t* _buf = (uint16_t*)((uint32_t)vram0 * (vram_index ^ 0x01) + (uint32_t)vram1 * vram_index);
vram_index = vram_index ^ 0x01;
for (; i<num_lines_to_write; i++) {
int _y = y+i;
if (_y >= max_y) {
break;
}
int source_y = (float)_y * inv_y_scale;
const uint8_t* _frame = (const uint8_t*)_frame_ptr;
// write two pixels (32 bits) at a time because it's faster
for (int x=0; x<max_x/2; x++) {
Expand All @@ -797,7 +797,23 @@ bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
_buf[dst_index] = _palette[_frame[src_index] % palette_size_];
_buf[dst_index + 1] = _palette[_frame[src_index + 1] % palette_size_];
}
} else {
}
box.write_lcd_frame(0 + _x_offset, y, max_x, i, (uint8_t*)&_buf[0]);
}
} else {
// no palette
for (int y=0; y<max_y; y+=num_lines_to_write) {
// each iteration of the loop, we swap the vram index so that we can
// write to the other buffer while the other one is being transmitted
int i = 0;
uint16_t* _buf = (uint16_t*)((uint32_t)vram0 * (vram_index ^ 0x01) + (uint32_t)vram1 * vram_index);
vram_index = vram_index ^ 0x01;
for (; i<num_lines_to_write; i++) {
int _y = y+i;
if (_y >= max_y) {
break;
}
int source_y = (float)_y * inv_y_scale;
const uint16_t* _frame = (const uint16_t*)_frame_ptr;
// write two pixels (32 bits) at a time because it's faster
for (int x=0; x<max_x/2; x++) {
Expand All @@ -808,13 +824,9 @@ bool BoxEmu::video_task_callback(std::mutex &m, std::condition_variable& cv) {
_buf[dst_index + 1] = _frame[src_index + 1];
}
}
box.write_lcd_frame(0 + _x_offset, y, max_x, i, (uint8_t*)&_buf[0]);
}
box.write_lcd_frame(0 + _x_offset, y, max_x, i, (uint8_t*)&_buf[0]);
}
}

// we don't have to worry here since we know there was an item in the queue
// since we peeked earlier.
xQueueReceive(video_queue_, &_frame_ptr, 10 / portTICK_PERIOD_MS);
return false;
}

0 comments on commit 2f2e219

Please sign in to comment.