Skip to content

Commit

Permalink
Merge pull request #70065 from clayjohn/GLES3-attribs
Browse files Browse the repository at this point in the history
Use instanced array buffer instead of UBO for canvas item batching
  • Loading branch information
akien-mga authored Dec 15, 2022
2 parents fac614c + b6a1aa1 commit 47ef054
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 207 deletions.
178 changes: 72 additions & 106 deletions drivers/gles3/rasterizer_canvas_gles3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -568,9 +568,8 @@ void RasterizerCanvasGLES3::_render_items(RID p_to_render_target, int p_item_cou
_new_batch(batch_broken, index);

// Override the start position and index as we want to start from where we finished off last time.
state.canvas_instance_batches[state.current_batch_index].start = r_last_index * sizeof(InstanceData);
state.canvas_instance_batches[state.current_batch_index].start = r_last_index;
index = 0;
_align_instance_data_buffer(index);

for (int i = 0; i < p_item_count; i++) {
Item *ci = items[i];
Expand Down Expand Up @@ -630,14 +629,14 @@ void RasterizerCanvasGLES3::_render_items(RID p_to_render_target, int p_item_cou
}

// Copy over all data needed for rendering.
glBindBuffer(GL_UNIFORM_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].ubo);
glBindBuffer(GL_ARRAY_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].buffer);
#ifdef WEB_ENABLED
glBufferSubData(GL_UNIFORM_BUFFER, r_last_index * sizeof(InstanceData), sizeof(InstanceData) * index, state.instance_data_array);
glBufferSubData(GL_ARRAY_BUFFER, r_last_index * sizeof(InstanceData), sizeof(InstanceData) * index, state.instance_data_array);
#else
// On Desktop and mobile we map the memory without synchronizing for maximum speed.
void *ubo = glMapBufferRange(GL_UNIFORM_BUFFER, r_last_index * sizeof(InstanceData), index * sizeof(InstanceData), GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
memcpy(ubo, state.instance_data_array, index * sizeof(InstanceData));
glUnmapBuffer(GL_UNIFORM_BUFFER);
void *buffer = glMapBufferRange(GL_ARRAY_BUFFER, r_last_index * sizeof(InstanceData), index * sizeof(InstanceData), GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
memcpy(buffer, state.instance_data_array, index * sizeof(InstanceData));
glUnmapBuffer(GL_ARRAY_BUFFER);
#endif

glDisable(GL_SCISSOR_TEST);
Expand All @@ -664,7 +663,17 @@ void RasterizerCanvasGLES3::_render_items(RID p_to_render_target, int p_item_cou
uint64_t specialization = 0;
specialization |= uint64_t(state.canvas_instance_batches[i].lights_disabled);
specialization |= uint64_t(!GLES3::Config::get_singleton()->float_texture_supported) << 1;
bool success = _bind_material(material_data, variant, specialization);
RID shader_version = data.canvas_shader_default_version;

if (material_data) {
if (material_data->shader_data->version.is_valid() && material_data->shader_data->valid) {
// Bind uniform buffer and textures
material_data->bind_uniforms();
shader_version = material_data->shader_data->version;
}
}

bool success = GLES3::MaterialStorage::get_singleton()->shaders.canvas_shader.version_bind_shader(shader_version, variant, specialization);
if (!success) {
continue;
}
Expand Down Expand Up @@ -1217,26 +1226,15 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {

_bind_canvas_texture(state.canvas_instance_batches[p_index].tex, state.canvas_instance_batches[p_index].filter, state.canvas_instance_batches[p_index].repeat);

// Bind the region of the UBO used by this batch.
// If region exceeds the boundary of the UBO, just ignore.
uint32_t range_bytes = data.max_instances_per_batch * sizeof(InstanceData);
if (state.canvas_instance_batches[p_index].start >= (data.max_instances_per_ubo - 1) * sizeof(InstanceData)) {
return;
} else if (state.canvas_instance_batches[p_index].start >= (data.max_instances_per_ubo - data.max_instances_per_batch) * sizeof(InstanceData)) {
// If we have less than a full batch at the end, we can just draw it anyway.
// OpenGL will complain about the UBO being smaller than expected, but it should render fine.
range_bytes = (data.max_instances_per_ubo - 1) * sizeof(InstanceData) - state.canvas_instance_batches[p_index].start;
}

uint32_t range_start = state.canvas_instance_batches[p_index].start;
glBindBufferRange(GL_UNIFORM_BUFFER, INSTANCE_UNIFORM_LOCATION, state.canvas_instance_data_buffers[state.current_buffer].ubo, range_start, range_bytes);

switch (state.canvas_instance_batches[p_index].command_type) {
case Item::Command::TYPE_RECT:
case Item::Command::TYPE_NINEPATCH: {
glBindVertexArray(data.indexed_quad_array);
glDrawElements(GL_TRIANGLES, state.canvas_instance_batches[p_index].instance_count * 6, GL_UNSIGNED_INT, 0);
glBindBuffer(GL_UNIFORM_BUFFER, 0);
glBindBuffer(GL_ARRAY_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].buffer);
uint32_t range_start = state.canvas_instance_batches[p_index].start * sizeof(InstanceData);
_enable_attributes(range_start, false);

glDrawElementsInstanced(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0, state.canvas_instance_batches[p_index].instance_count);
glBindVertexArray(0);

} break;
Expand All @@ -1248,18 +1246,21 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
ERR_FAIL_COND(!pb);

glBindVertexArray(pb->vertex_array);
glBindBuffer(GL_ARRAY_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].buffer);

uint32_t range_start = state.canvas_instance_batches[p_index].start * sizeof(InstanceData);
_enable_attributes(range_start, false);

if (pb->color_disabled && pb->color != Color(1.0, 1.0, 1.0, 1.0)) {
glVertexAttrib4f(RS::ARRAY_COLOR, pb->color.r, pb->color.g, pb->color.b, pb->color.a);
}

if (pb->index_buffer != 0) {
glDrawElements(prim[polygon->primitive], pb->count, GL_UNSIGNED_INT, nullptr);
glDrawElementsInstanced(prim[polygon->primitive], pb->count, GL_UNSIGNED_INT, nullptr, 1);
} else {
glDrawArrays(prim[polygon->primitive], 0, pb->count);
glDrawArraysInstanced(prim[polygon->primitive], 0, pb->count, 1);
}
glBindVertexArray(0);
glBindBuffer(GL_UNIFORM_BUFFER, 0);

if (pb->color_disabled && pb->color != Color(1.0, 1.0, 1.0, 1.0)) {
// Reset so this doesn't pollute other draw calls.
Expand All @@ -1269,14 +1270,16 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {

case Item::Command::TYPE_PRIMITIVE: {
glBindVertexArray(data.canvas_quad_array);
glBindBuffer(GL_ARRAY_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].buffer);
uint32_t range_start = state.canvas_instance_batches[p_index].start * sizeof(InstanceData);
_enable_attributes(range_start, true);

const GLenum primitive[5] = { GL_POINTS, GL_POINTS, GL_LINES, GL_TRIANGLES, GL_TRIANGLES };
int instance_count = state.canvas_instance_batches[p_index].instance_count;
if (instance_count > 1) {
ERR_FAIL_COND(instance_count <= 0);
if (instance_count >= 1) {
glDrawArraysInstanced(primitive[state.canvas_instance_batches[p_index].primitive_points], 0, state.canvas_instance_batches[p_index].primitive_points, instance_count);
} else {
glDrawArrays(primitive[state.canvas_instance_batches[p_index].primitive_points], 0, state.canvas_instance_batches[p_index].primitive_points);
}
glBindBuffer(GL_UNIFORM_BUFFER, 0);

} break;

Expand Down Expand Up @@ -1370,6 +1373,11 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
index_array_gl = mesh_storage->mesh_surface_get_index_buffer(surface, 0);
bool use_index_buffer = false;
glBindVertexArray(vertex_array_gl);
glBindBuffer(GL_ARRAY_BUFFER, state.canvas_instance_data_buffers[state.current_buffer].buffer);

uint32_t range_start = state.canvas_instance_batches[p_index].start * sizeof(InstanceData);
_enable_attributes(range_start, false, instance_count);

if (index_array_gl != 0) {
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_array_gl);
use_index_buffer = true;
Expand All @@ -1396,20 +1404,13 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
}

GLenum primitive_gl = prim[int(primitive)];
if (instance_count == 1) {
if (use_index_buffer) {
glDrawElements(primitive_gl, mesh_storage->mesh_surface_get_vertices_drawn_count(surface), mesh_storage->mesh_surface_get_index_type(surface), 0);
} else {
glDrawArrays(primitive_gl, 0, mesh_storage->mesh_surface_get_vertices_drawn_count(surface));
}
} else if (instance_count > 1) {
if (use_index_buffer) {
glDrawElementsInstanced(primitive_gl, mesh_storage->mesh_surface_get_vertices_drawn_count(surface), mesh_storage->mesh_surface_get_index_type(surface), 0, instance_count);
} else {
glDrawArraysInstanced(primitive_gl, 0, mesh_storage->mesh_surface_get_vertices_drawn_count(surface), instance_count);
}
}

if (use_index_buffer) {
glDrawElementsInstanced(primitive_gl, mesh_storage->mesh_surface_get_vertices_drawn_count(surface), mesh_storage->mesh_surface_get_index_type(surface), 0, instance_count);
} else {
glDrawArraysInstanced(primitive_gl, 0, mesh_storage->mesh_surface_get_vertices_drawn_count(surface), instance_count);
}
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
if (instance_count > 1) {
glDisableVertexAttribArray(5);
Expand All @@ -1429,7 +1430,7 @@ void RasterizerCanvasGLES3::_render_batch(Light *p_lights, uint32_t p_index) {
}

void RasterizerCanvasGLES3::_add_to_batch(uint32_t &r_index, bool &r_batch_broken) {
if (r_index >= data.max_instances_per_ubo - 1) {
if (r_index >= data.max_instances_per_buffer - 1) {
ERR_PRINT_ONCE("Trying to draw too many items. Please increase maximum number of items in the project settings 'rendering/gl_compatibility/item_buffer_size'");
return;
}
Expand Down Expand Up @@ -1457,27 +1458,25 @@ void RasterizerCanvasGLES3::_new_batch(bool &r_batch_broken, uint32_t &r_index)
// Copy the properties of the current batch, we will manually update the things that changed.
Batch new_batch = state.canvas_instance_batches[state.current_batch_index];
new_batch.instance_count = 0;
new_batch.start = state.canvas_instance_batches[state.current_batch_index].start + state.canvas_instance_batches[state.current_batch_index].instance_count * sizeof(InstanceData);
new_batch.start = state.canvas_instance_batches[state.current_batch_index].start + state.canvas_instance_batches[state.current_batch_index].instance_count;

state.current_batch_index++;
state.canvas_instance_batches.push_back(new_batch);
_align_instance_data_buffer(r_index);
}

bool RasterizerCanvasGLES3::_bind_material(GLES3::CanvasMaterialData *p_material_data, CanvasShaderGLES3::ShaderVariant p_variant, uint64_t p_specialization) {
if (p_material_data) {
if (p_material_data->shader_data->version.is_valid() && p_material_data->shader_data->valid) {
// Bind uniform buffer and textures
p_material_data->bind_uniforms();
return GLES3::MaterialStorage::get_singleton()->shaders.canvas_shader.version_bind_shader(p_material_data->shader_data->version, p_variant, p_specialization);
} else {
return GLES3::MaterialStorage::get_singleton()->shaders.canvas_shader.version_bind_shader(data.canvas_shader_default_version, p_variant, p_specialization);
}
} else {
return GLES3::MaterialStorage::get_singleton()->shaders.canvas_shader.version_bind_shader(data.canvas_shader_default_version, p_variant, p_specialization);
void RasterizerCanvasGLES3::_enable_attributes(uint32_t p_start, bool p_primitive, uint32_t p_rate) {
uint32_t split = p_primitive ? 11 : 12;
for (uint32_t i = 6; i < split; i++) {
glEnableVertexAttribArray(i);
glVertexAttribPointer(i, 4, GL_FLOAT, GL_FALSE, sizeof(InstanceData), CAST_INT_TO_UCHAR_PTR(p_start + (i - 6) * 4 * sizeof(float)));
glVertexAttribDivisor(i, p_rate);
}
for (uint32_t i = split; i <= 13; i++) {
glEnableVertexAttribArray(i);
glVertexAttribIPointer(i, 4, GL_UNSIGNED_INT, sizeof(InstanceData), CAST_INT_TO_UCHAR_PTR(p_start + (i - 6) * 4 * sizeof(float)));
glVertexAttribDivisor(i, p_rate);
}
}

RID RasterizerCanvasGLES3::light_create() {
CanvasLight canvas_light;
return canvas_light_owner.make_rid(canvas_light);
Expand Down Expand Up @@ -2416,8 +2415,8 @@ void RasterizerCanvasGLES3::_allocate_instance_data_buffer() {
GLuint new_buffers[3];
glGenBuffers(3, new_buffers);
// Batch UBO.
glBindBuffer(GL_UNIFORM_BUFFER, new_buffers[0]);
glBufferData(GL_UNIFORM_BUFFER, data.max_instance_buffer_size, nullptr, GL_STREAM_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, new_buffers[0]);
glBufferData(GL_ARRAY_BUFFER, data.max_instance_buffer_size, nullptr, GL_STREAM_DRAW);
// Light uniform buffer.
glBindBuffer(GL_UNIFORM_BUFFER, new_buffers[1]);
glBufferData(GL_UNIFORM_BUFFER, sizeof(LightUniform) * data.max_lights_per_render, nullptr, GL_STREAM_DRAW);
Expand All @@ -2427,36 +2426,16 @@ void RasterizerCanvasGLES3::_allocate_instance_data_buffer() {

state.current_buffer = (state.current_buffer + 1);
DataBuffer db;
db.ubo = new_buffers[0];
db.buffer = new_buffers[0];
db.light_ubo = new_buffers[1];
db.state_ubo = new_buffers[2];
db.last_frame_used = RSG::rasterizer->get_frame_number();
state.canvas_instance_data_buffers.insert(state.current_buffer, db);
state.current_buffer = state.current_buffer % state.canvas_instance_data_buffers.size();
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindBuffer(GL_UNIFORM_BUFFER, 0);
}

// Batch start positions need to be aligned to the device's GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT.
// This needs to be called anytime a new batch is created.
void RasterizerCanvasGLES3::_align_instance_data_buffer(uint32_t &r_index) {
if (GLES3::Config::get_singleton()->uniform_buffer_offset_alignment > int(sizeof(InstanceData))) {
uint32_t offset = state.canvas_instance_batches[state.current_batch_index].start % GLES3::Config::get_singleton()->uniform_buffer_offset_alignment;
if (offset > 0) {
// uniform_buffer_offset_alignment can be 4, 16, 32, or 256. Our instance batches are 128 bytes.
// Accordingly, this branch is only triggered if we are 128 bytes off.
uint32_t offset_bytes = GLES3::Config::get_singleton()->uniform_buffer_offset_alignment - offset;
state.canvas_instance_batches[state.current_batch_index].start += offset_bytes;
// Offset the instance array so it stays in sync with batch start points.
// This creates gaps in the instance buffer with wasted space, but we can't help it.
r_index += offset_bytes / sizeof(InstanceData);
if (r_index > 0) {
// In this case we need to copy over the basic data.
state.instance_data_array[r_index] = state.instance_data_array[r_index - 1];
}
}
}
}

void RasterizerCanvasGLES3::set_time(double p_time) {
state.time = p_time;
}
Expand Down Expand Up @@ -2601,63 +2580,50 @@ RasterizerCanvasGLES3::RasterizerCanvasGLES3() {
data.max_instances_per_batch = 128;
} else {
data.max_lights_per_render = 256;
data.max_instances_per_batch = 512;
data.max_instances_per_batch = 2048;
}

// Reserve 3 Uniform Buffers for instance data Frame N, N+1 and N+2
data.max_instances_per_ubo = MAX(data.max_instances_per_batch, uint32_t(GLOBAL_GET("rendering/gl_compatibility/item_buffer_size")));
data.max_instance_buffer_size = data.max_instances_per_ubo * sizeof(InstanceData); // 16,384 instances * 128 bytes = 2,097,152 bytes = 2,048 kb
data.max_instances_per_buffer = MAX(data.max_instances_per_batch, uint32_t(GLOBAL_GET("rendering/gl_compatibility/item_buffer_size")));
data.max_instance_buffer_size = data.max_instances_per_buffer * sizeof(InstanceData); // 16,384 instances * 128 bytes = 2,097,152 bytes = 2,048 kb
state.canvas_instance_data_buffers.resize(3);
state.canvas_instance_batches.reserve(200);

for (int i = 0; i < 3; i++) {
GLuint new_buffers[3];
glGenBuffers(3, new_buffers);
// Batch UBO.
glBindBuffer(GL_UNIFORM_BUFFER, new_buffers[0]);
glBufferData(GL_UNIFORM_BUFFER, data.max_instance_buffer_size, nullptr, GL_STREAM_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, new_buffers[0]);
glBufferData(GL_ARRAY_BUFFER, data.max_instance_buffer_size, nullptr, GL_STREAM_DRAW);
// Light uniform buffer.
glBindBuffer(GL_UNIFORM_BUFFER, new_buffers[1]);
glBufferData(GL_UNIFORM_BUFFER, sizeof(LightUniform) * data.max_lights_per_render, nullptr, GL_STREAM_DRAW);
// State buffer.
glBindBuffer(GL_UNIFORM_BUFFER, new_buffers[2]);
glBufferData(GL_UNIFORM_BUFFER, sizeof(StateBuffer), nullptr, GL_STREAM_DRAW);
DataBuffer db;
db.ubo = new_buffers[0];
db.buffer = new_buffers[0];
db.light_ubo = new_buffers[1];
db.state_ubo = new_buffers[2];
db.last_frame_used = 0;
db.fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
state.canvas_instance_data_buffers[i] = db;
}
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindBuffer(GL_UNIFORM_BUFFER, 0);

state.instance_data_array = memnew_arr(InstanceData, data.max_instances_per_ubo);
state.instance_data_array = memnew_arr(InstanceData, data.max_instances_per_buffer);
state.light_uniforms = memnew_arr(LightUniform, data.max_lights_per_render);

{
const uint32_t no_of_instances = data.max_instances_per_batch;

const uint32_t indices[6] = { 0, 2, 1, 3, 2, 0 };
glGenVertexArrays(1, &data.indexed_quad_array);
glBindVertexArray(data.indexed_quad_array);
glBindBuffer(GL_ARRAY_BUFFER, data.canvas_quad_vertices);

const uint32_t num_indices = 6;
const uint32_t quad_indices[num_indices] = { 0, 2, 1, 3, 2, 0 };

const uint32_t total_indices = no_of_instances * num_indices;
uint32_t *indices = new uint32_t[total_indices];
for (uint32_t i = 0; i < total_indices; i++) {
uint32_t quad = i / num_indices;
uint32_t quad_local = i % num_indices;
indices[i] = quad_indices[quad_local] + quad * num_indices;
}

glGenBuffers(1, &data.indexed_quad_buffer);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, data.indexed_quad_buffer);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(uint32_t) * total_indices, indices, GL_STATIC_DRAW);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(uint32_t) * 6, indices, GL_STATIC_DRAW);
glBindVertexArray(0);
delete[] indices;
}

String global_defines;
Expand Down
5 changes: 3 additions & 2 deletions drivers/gles3/rasterizer_canvas_gles3.h
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ class RasterizerCanvasGLES3 : public RendererCanvasRender {
uint32_t max_lights_per_render = 256;
uint32_t max_lights_per_item = 16;
uint32_t max_instances_per_batch = 512;
uint32_t max_instances_per_ubo = 16384;
uint32_t max_instances_per_buffer = 16384;
uint32_t max_instance_buffer_size = 16384 * 128;
} data;

Expand Down Expand Up @@ -278,7 +278,7 @@ class RasterizerCanvasGLES3 : public RendererCanvasRender {
// We track them and ensure that they don't get reused until at least 2 frames have passed
// to avoid the GPU stalling to wait for a resource to become available.
struct DataBuffer {
GLuint ubo = 0;
GLuint buffer = 0;
GLuint light_ubo = 0;
GLuint state_ubo = 0;
uint64_t last_frame_used = -3;
Expand Down Expand Up @@ -359,6 +359,7 @@ class RasterizerCanvasGLES3 : public RendererCanvasRender {
void _add_to_batch(uint32_t &r_index, bool &r_batch_broken);
void _allocate_instance_data_buffer();
void _align_instance_data_buffer(uint32_t &r_index);
void _enable_attributes(uint32_t p_start, bool p_primitive, uint32_t p_rate = 1);

void set_time(double p_time);

Expand Down
Loading

0 comments on commit 47ef054

Please sign in to comment.