-
-
Notifications
You must be signed in to change notification settings - Fork 21.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Proof of concept fix for Async GPU Readbacks #87850
Conversation
@@ -42,6 +42,7 @@ | |||
#include <stdio.h> | |||
#include <stdlib.h> | |||
#include <string.h> | |||
#include <cstdint> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#include <cstdint> |
Already included (even in the header, only include what's necessary)
@@ -40,6 +40,7 @@ | |||
#include "rendering_device_driver_vulkan.h" | |||
#include "servers/display_server.h" | |||
#include "servers/rendering/renderer_rd/api_context_rd.h" | |||
#include <cstdint> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#include <cstdint> |
@@ -29,6 +29,7 @@ | |||
/**************************************************************************/ | |||
|
|||
#include "rendering_device.h" | |||
#include "core/os/thread_safe.h" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#include "core/os/thread_safe.h" |
bool RenderingDevice::check_status(){ | ||
_THREAD_SAFE_METHOD_ | ||
|
||
return context->local_device_check_status(local_device); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
return context->local_device_check_status(local_device); | |
return context->local_device_check_status(local_device); |
@@ -1333,6 +1333,7 @@ class RenderingDevice : public RenderingDeviceCommons { | |||
|
|||
void submit(); | |||
void sync(); | |||
bool check_status(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bool check_status(); | |
bool check_status(); |
uint64_t out = 0; | ||
vkGetSemaphoreCounterValue(ld->device, compute_semaphore[0], &out); | ||
return out >= compute_semaphore_signal_value; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
uint64_t out = 0; | |
vkGetSemaphoreCounterValue(ld->device, compute_semaphore[0], &out); | |
return out >= compute_semaphore_signal_value; | |
} | |
uint64_t out = 0; | |
vkGetSemaphoreCounterValue(ld->device, compute_semaphore[0], &out); | |
return out >= compute_semaphore_signal_value; | |
} |
@@ -133,6 +134,8 @@ class VulkanContext : public ApiContextRD { | |||
VkFormat format; | |||
VkSemaphore draw_complete_semaphores[FRAME_LAG]; | |||
VkSemaphore image_ownership_semaphores[FRAME_LAG]; | |||
VkSemaphore compute_semaphore[1]; | |||
uint64_t compute_semaphore_signal_value = 0; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
uint64_t compute_semaphore_signal_value = 0; | |
uint64_t compute_semaphore_signal_value = 0; |
submit_info.pSignalSemaphores = nullptr; | ||
submit_info.signalSemaphoreCount = 1; | ||
submit_info.pSignalSemaphores = &compute_semaphore[0]; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
err = vkCreateSemaphore(device, &computeSemaphoreCreateInfo, nullptr, &compute_semaphore[0]); | ||
ERR_FAIL_COND_V(err, ERR_CANT_CREATE); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
err = vkCreateSemaphore(device, &computeSemaphoreCreateInfo, nullptr, &compute_semaphore[0]); | |
ERR_FAIL_COND_V(err, ERR_CANT_CREATE); | |
err = vkCreateSemaphore(device, &computeSemaphoreCreateInfo, nullptr, &compute_semaphore[0]); | |
ERR_FAIL_COND_V(err, ERR_CANT_CREATE); |
// Every time a compute call finishes, we'll increment the value | ||
// of this semaphore by one, so we can check the status | ||
VkSemaphoreTypeCreateInfo timelineCreateInfo = { | ||
/*sType*/ VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, | ||
/*pNext*/ nullptr, | ||
/*semaphoreType*/ VK_SEMAPHORE_TYPE_TIMELINE, | ||
/*initialValue*/ 0, | ||
}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Every time a compute call finishes, we'll increment the value | |
// of this semaphore by one, so we can check the status | |
VkSemaphoreTypeCreateInfo timelineCreateInfo = { | |
/*sType*/ VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, | |
/*pNext*/ nullptr, | |
/*semaphoreType*/ VK_SEMAPHORE_TYPE_TIMELINE, | |
/*initialValue*/ 0, | |
}; | |
// Every time a compute call finishes, we'll increment the value | |
// of this semaphore by one, so we can check the status. | |
VkSemaphoreTypeCreateInfo timelineCreateInfo = { | |
/*sType*/ VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, | |
/*pNext*/ nullptr, | |
/*semaphoreType*/ VK_SEMAPHORE_TYPE_TIMELINE, | |
/*initialValue*/ 0, | |
}; |
@@ -4659,6 +4660,12 @@ void RenderingDevice::sync() { | |||
local_device_processing = false; | |||
} | |||
|
|||
bool RenderingDevice::check_status(){ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bool RenderingDevice::check_status(){ | |
bool RenderingDevice::check_status() { |
@@ -2813,8 +2843,18 @@ void VulkanContext::local_device_sync(RID p_local_device) { | |||
|
|||
vkDeviceWaitIdle(ld->device); | |||
ld->waiting = false; | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an interesting PR but we might run into conflicts since we intend to merge #87340 and that changes the entire design around RD being able to create semaphores and fences freely. I'd recommend looking into it as it probably would save you quite a bit of work. That said I find the use of timeline semaphores interesting. These are not guaranteed to be supported so you must check for support I'm afraid, so they can't just be created without checking for that. But judging by the way they're being used in the PR, I get the feeling they might not be strictly necessary. If you feel the use of timeline semaphores will be required in the future we could probably take that into account in the redesign of the PR I mentioned (as they're guaranteed in D3D12 but in Vulkan they must be queried for support). That's why the RHI goes for the lowest common denominator. Do you think it might be possible to achieve the effect you're looking for without using them? |
I tried to find a way of doing this with a simple boolean semaphore, but from what I could read in the API, there's no non-blocking way of reading a boolean semaphore value. If there is, a lot of this complexity goes away, so I'd be happy for that. |
I'm going to take some time and see if I can get this to sit on top of #87340 Hopefully they play nicely together. |
Fences actually allow a non-blocking (although not free) way to check their status, so if you only want to check if a command buffer in particular is finished, that could be a pretty cheap way to go about it. Considering the fact a local device does not allow more than one simultaneous command buffer, if you have to wait for a fence you can assume all work assigned to the previous buffer must be complete already, so you wouldn't need a counter for it either. Keep in mind this is not me being strict for the sake of it, it's just that since Godot aims to be as compatible as possible, you might find some resistance in using features newer than 1.0 as they might not be available in the devices it aims to support. Therefore anything that needs a particular version supported must be explicitly checked and a suitable alternative path must be taken if support is not present (if possible). For something as core as this, if we can find a way to work around it the better.
Thanks! My suggestion mostly comes from the fact if it sits on top of the RHI, you won't need to re-implement it for all backends (since D3D12 is already in master and Metal is also coming at a later point). |
@DarioSamo I totally understand. This was just a first pass attempt to see what gets the wheels spinning in people's heads since I almost certainly don't have the best approach. Here's a bit of an architecture question. Are compute shaders run in the same command buffer as the render/swap chains? Because the use case for this PR is a compute shader that runs over multiple frames needing to be synchronized only after its work has completed. |
To clarify, it's more of an issue of vendor support (check gpuinfo.org) than Vulkan version. Pretty much all devices where Vulkan is worth using support the 1.1 specification, but they don't always support the extensions you wish to use. On desktop, 1.2 is pretty much everywhere (catering to core 1.1 is only needed on Android). If supporting old Intel IGPs or outdated graphics drivers isn't required, then core 1.3 is pretty much universal too. There's also the issue that we need to use the "lowest common denominator" that works across Vulkan, Direct3D 12 and (soon) Metal. A feature that is only available in Vulkan without counterparts available in other APIs is much more difficult to use (see the workaround that was required for specialization constants in Direct3D 12). |
That is currently the case. There's no architecture for allowing compute lists to run in parallel at the moment as resource tracking and the render graph are only guaranteed to work correctly when called from the main thread. Any work queued up from those is run in the command buffer used for rendering and is expected to finish before presenting to the swap chain. Therefore, if you queue up some expensive compute work, it will stall the rendering. The use case you describe can be implemented very easily on top of the PR I posted (and in fact #87590 takes an approach like that to implement asynchronous transfer queues that run in parallel to the main ones). I can't say however what the ideal interface for accessing this would be yet (we have a lot of thread restrictions to enforce and resource tracking would need to be kept local to the context). EDIT: What I said above doesn't apply as such to local device, which carries the cost of creating a new Vulkan device altogether but allows you to run stuff independently of the main rendering driver. However, if you intend to share resources in GPU memory, that is not the ideal approach. |
@DarioSamo I think I follow. The actual dispatch of the compute payload would need to be done in a separate command buffer altogether, which your PR allows. So since the use case here is that we don't actually want to transfer anything, just run a check that the compute workload is done, that should be able to be done atomically. In the linked project, you can see that I'm just checking every frame whether the workload is done, and if so, call Maybe my use case is too simplistic to even really offer many benefits, and my efforts may be better spent working with PR #87590 |
Your linked project uses local device, so it's a bit of a different story. On local devices, the command buffers are local to it and are not shared with the Rendering Device used for drawing graphics to the screen at all. Given your particular use case now that I see it, I don't think you'd need to use timeline semaphores at all. Your current logic could be implemented if you call vkGetFenceStatus on the vkFence object for a particular 'frame'. In the case of local devices, 'frames' only ever holds one frame, so you'd just be checking if the frame has finished executing without doing an active wait on it. Note that these frames have no actual relation to the frames in the queue of the main rendering device instance. My other suggestions mostly apply if you wish to look into a solution that uses the existing RenderingDevice without creating a local one, as that comes with extra cost and limitations for sharing resources between them. |
Ah okay. So "local device" here means that I'm creating my own local |
Yup. Therefore all resources you create there can't be shared with the Singleton version. That may or may not be useful to you (we use this on the offline lightmapper for example). |
Okay cool. I appreciate your notes. I'll try and see whether #87590 exposes what I'm looking for and whether I have anything to contribute. |
Is this still being actively worked on? |
@QuinnLeavitt Unfortunately I haven't had time to see how much work it would be to get this to work with #87590 . I would love to take another crack at this when I have some time. |
I have an implementation and a basic demo to show how to achieve Asynchronous Compute Shader execution with Vulkan Timeline Semaphores.
Currently Vulkan specific just to get the ball rolling
And here's a repurposed ray tracing project that demonstrates how my changes can avoid locking up the CPU while waiting for Compute Shaders to finish execution:
https://github.com/granitrocky/Raytracing_Godot4
RenderingDevice.buffer_get_data()
(hardware readbacks) godot-proposals#7886