Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions libc/utils/gpu/loader/amdgpu/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,34 +222,35 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
// Set up the packet for exeuction on the device. We currently only launch
// with one thread on the device, forcing the rest of the wavefront to be
// masked off.
std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
(params.num_blocks_z * params.num_threads_z != 1))
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
(params.num_blocks_z * params.num_threads_z != 1))
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet->workgroup_size_x = params.num_threads_x;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should keep the memset but offset it by the first 4 bytes so we don't need to worry about the reserved fields off of the packet. Unless it's very important that we write to these only once.

Copy link
Collaborator Author

@JonChesterfield JonChesterfield Oct 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To what end? The packet field assignments following are complete, and if they weren't, that would be a problem. It's probably only the first byte that is critical, but might be the first four bytes in practice.

packet->workgroup_size_y = params.num_threads_y;
packet->workgroup_size_z = params.num_threads_z;
packet->reserved0 = 0;
packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
packet->private_segment_size = private_size;
packet->group_segment_size = group_size;
packet->kernel_object = kernel;
packet->kernarg_address = args;

packet->reserved2 = 0;
// Create a signal to indicate when this packet has been completed.
if (hsa_status_t err =
hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
handle_error(err);

// Initialize the packet header and set the doorbell signal to begin execution
// by the HSA runtime.
uint16_t setup = packet->setup;
uint16_t header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
__atomic_store_n(&packet->header, header | (setup << 16), __ATOMIC_RELEASE);
uint32_t header_word =
header | (setup << 16u) __atomic_store_n((uint32_t *)&packet->header,
header_word, __ATOMIC_RELEASE);
hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

// Wait until the kernel has completed execution on the device. Periodically
Expand Down
16 changes: 8 additions & 8 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -650,8 +650,8 @@ struct AMDGPUQueueTy {
hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
assert(Packet && "Invalid packet");

// The header of the packet is written in the last moment.
Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
// The first 32 bits of the packet are written after the other fields
uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
Packet->workgroup_size_x = NumThreads;
Packet->workgroup_size_y = 1;
Packet->workgroup_size_z = 1;
Expand All @@ -667,7 +667,7 @@ struct AMDGPUQueueTy {
Packet->completion_signal = OutputSignal->get();

// Publish the packet. Do not modify the packet after this point.
publishKernelPacket(PacketId, Packet);
publishKernelPacket(PacketId, Setup, Packet);

return Plugin::success();
}
Expand Down Expand Up @@ -744,17 +744,17 @@ struct AMDGPUQueueTy {
/// Publish the kernel packet so that the HSA runtime can start processing
/// the kernel launch. Do not modify the packet once this function is called.
/// Assumes the queue lock is acquired.
void publishKernelPacket(uint64_t PacketId,
void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
hsa_kernel_dispatch_packet_t *Packet) {
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);

uint16_t Setup = Packet->setup;
uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

// Publish the packet. Do not modify the package after this point.
__atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE);
uint32_t HeaderWord = Header | (Setup << 16u);
__atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);

// Signal the doorbell about the published packet.
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
Expand All @@ -766,14 +766,14 @@ struct AMDGPUQueueTy {
void publishBarrierPacket(uint64_t PacketId,
hsa_barrier_and_packet_t *Packet) {
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);

uint16_t Setup = 0;
uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

// Publish the packet. Do not modify the package after this point.
__atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE);
uint32_t HeaderWord = Header | (Setup << 16u);
__atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);

// Signal the doorbell about the published packet.
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
Expand Down