diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index ba3e41ce3e5a8..a50699c68f65b 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -117,7 +117,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.time.nanosleep # gpu/rpc.h entrypoints - libc.src.gpu.rpc_reset libc.src.gpu.rpc_host_call ) diff --git a/libc/spec/gpu_ext.td b/libc/spec/gpu_ext.td index dca1e9f80f71e..dce81ff778620 100644 --- a/libc/spec/gpu_ext.td +++ b/libc/spec/gpu_ext.td @@ -5,11 +5,6 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> { [], // Types [], // Enumerations [ - FunctionSpec< - "rpc_reset", - RetValSpec, - [ArgSpec, ArgSpec] - >, FunctionSpec< "rpc_host_call", RetValSpec, diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h index fc95e5edf1c72..80bcd37753d22 100644 --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -88,20 +88,13 @@ template struct Process { static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8; cpp::Atomic lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0}; - /// Initialize the communication channels. - LIBC_INLINE void reset(uint32_t port_count, void *buffer) { - this->port_count = port_count; - this->inbox = reinterpret_cast *>( - advance(buffer, inbox_offset(port_count))); - this->outbox = reinterpret_cast *>( - advance(buffer, outbox_offset(port_count))); - this->packet = - reinterpret_cast(advance(buffer, buffer_offset(port_count))); - } - - /// Returns the beginning of the unified buffer. Intended for initializing the - /// client after the server has been started. - LIBC_INLINE void *get_buffer_start() const { return Invert ? outbox : inbox; } + LIBC_INLINE Process(uint32_t port_count, void *buffer) + : port_count(port_count), inbox(reinterpret_cast *>( + advance(buffer, inbox_offset(port_count)))), + outbox(reinterpret_cast *>( + advance(buffer, outbox_offset(port_count)))), + packet(reinterpret_cast( + advance(buffer, buffer_offset(port_count)))) {} /// Allocate a memory buffer sufficient to store the following equivalent /// representation in memory. @@ -116,13 +109,13 @@ template struct Process { } /// Retrieve the inbox state from memory shared between processes. - LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) { + LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const { return gpu::broadcast_value(lane_mask, inbox[index].load(cpp::MemoryOrder::RELAXED)); } /// Retrieve the outbox state from memory shared between processes. - LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) { + LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const { return gpu::broadcast_value(lane_mask, outbox[index].load(cpp::MemoryOrder::RELAXED)); } @@ -349,13 +342,12 @@ struct Client { LIBC_INLINE Client &operator=(const Client &) = delete; LIBC_INLINE ~Client() = default; + LIBC_INLINE Client(uint32_t port_count, void *buffer) + : process(port_count, buffer) {} + using Port = rpc::Port>; template LIBC_INLINE Port open(); - LIBC_INLINE void reset(uint32_t port_count, void *buffer) { - process.reset(port_count, buffer); - } - private: Process> process; }; @@ -371,18 +363,13 @@ template struct Server { LIBC_INLINE Server &operator=(const Server &) = delete; LIBC_INLINE ~Server() = default; + LIBC_INLINE Server(uint32_t port_count, void *buffer) + : process(port_count, buffer) {} + using Port = rpc::Port>; LIBC_INLINE cpp::optional try_open(); LIBC_INLINE Port open(); - LIBC_INLINE void reset(uint32_t port_count, void *buffer) { - process.reset(port_count, buffer); - } - - LIBC_INLINE void *get_buffer_start() const { - return process.get_buffer_start(); - } - LIBC_INLINE static uint64_t allocation_size(uint32_t port_count) { return Process>::allocation_size(port_count); } diff --git a/libc/src/gpu/CMakeLists.txt b/libc/src/gpu/CMakeLists.txt index a0701c835bf46..e20228516b511 100644 --- a/libc/src/gpu/CMakeLists.txt +++ b/libc/src/gpu/CMakeLists.txt @@ -1,14 +1,3 @@ -add_entrypoint_object( - rpc_reset - SRCS - rpc_reset.cpp - HDRS - rpc_reset.h - DEPENDS - libc.src.__support.RPC.rpc_client - libc.src.__support.GPU.utils -) - add_entrypoint_object( rpc_host_call SRCS diff --git a/libc/src/gpu/rpc_reset.cpp b/libc/src/gpu/rpc_reset.cpp deleted file mode 100644 index ba5a097d1a1bc..0000000000000 --- a/libc/src/gpu/rpc_reset.cpp +++ /dev/null @@ -1,24 +0,0 @@ -//===---------- GPU implementation of the external RPC functionion --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/gpu/rpc_reset.h" - -#include "src/__support/GPU/utils.h" -#include "src/__support/RPC/rpc_client.h" -#include "src/__support/common.h" - -namespace __llvm_libc { - -// This is the external interface to initialize the RPC client with the -// shared buffer. -LLVM_LIBC_FUNCTION(void, rpc_reset, - (unsigned int num_ports, void *rpc_shared_buffer)) { - __llvm_libc::rpc::client.reset(num_ports, rpc_shared_buffer); -} - -} // namespace __llvm_libc diff --git a/libc/src/gpu/rpc_reset.h b/libc/src/gpu/rpc_reset.h deleted file mode 100644 index 5d6a6632760f8..0000000000000 --- a/libc/src/gpu/rpc_reset.h +++ /dev/null @@ -1,18 +0,0 @@ -//===-- Implementation header for RPC functions -----------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_GPU_RPC_H -#define LLVM_LIBC_SRC_GPU_RPC_H - -namespace __llvm_libc { - -void rpc_reset(unsigned int num_ports, void *buffer); - -} // namespace __llvm_libc - -#endif // LLVM_LIBC_SRC_GPU_RPC_H diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp index b2adb1d3abcaf..e6304ab243b8f 100644 --- a/libc/startup/gpu/amdgpu/start.cpp +++ b/libc/startup/gpu/amdgpu/start.cpp @@ -44,12 +44,7 @@ static void call_fini_array_callbacks() { } // namespace __llvm_libc extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void -_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) { - // We need to set up the RPC client first in case any of the constructors - // require it. - __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT, - rpc_shared_buffer); - +_begin(int argc, char **argv, char **env) { // We want the fini array callbacks to be run after other atexit // callbacks are run. So, we register them before running the init // array callbacks as they can potentially register their own atexit diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp index cd442394e74ce..d5d3ad2f15cac 100644 --- a/libc/startup/gpu/nvptx/start.cpp +++ b/libc/startup/gpu/nvptx/start.cpp @@ -42,12 +42,7 @@ static void call_fini_array_callbacks() { } // namespace __llvm_libc extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void -_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) { - // We need to set up the RPC client first in case any of the constructors - // require it. - __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT, - rpc_shared_buffer); - +_begin(int argc, char **argv, char **env) { // We want the fini array callbacks to be run after other atexit // callbacks are run. So, we register them before running the init // array callbacks as they can potentially register their own atexit diff --git a/libc/test/src/__support/RPC/rpc_smoke_test.cpp b/libc/test/src/__support/RPC/rpc_smoke_test.cpp index 587ca8eb111f2..b575e01bcb9ba 100644 --- a/libc/test/src/__support/RPC/rpc_smoke_test.cpp +++ b/libc/test/src/__support/RPC/rpc_smoke_test.cpp @@ -33,13 +33,8 @@ alignas(64) char buffer[alloc_size] = {0}; TEST(LlvmLibcRPCSmoke, SanityCheck) { - ProcAType ProcA; - ProcBType ProcB; - - ProcA.reset(port_count, buffer); - ProcB.reset(port_count, buffer); - - EXPECT_EQ(ProcA.get_buffer_start(), ProcB.get_buffer_start()); + ProcAType ProcA(port_count, buffer); + ProcBType ProcB(port_count, buffer); uint64_t index = 0; // any < port_count uint64_t lane_mask = 1; diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h index 4eef88bf0463c..d2b2ee5baebed 100644 --- a/libc/utils/gpu/loader/Loader.h +++ b/libc/utils/gpu/loader/Loader.h @@ -34,7 +34,6 @@ struct begin_args_t { int argc; void *argv; void *envp; - void *rpc_shared_buffer; }; /// The arguments to the '_start' kernel. diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp index 9d732fe987da4..1d0247a6dc5dc 100644 --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -430,6 +430,49 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, else handle_error("Invalid wavefront size"); + // Initialize the RPC client on the device by copying the local data to the + // device's internal pointer. + hsa_executable_symbol_t rpc_client_sym; + if (hsa_status_t err = hsa_executable_get_symbol_by_name( + executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym)) + handle_error(err); + + void *rpc_client_host; + if (hsa_status_t err = + hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(void *), + /*flags=*/0, &rpc_client_host)) + handle_error(err); + + void *rpc_client_dev; + if (hsa_status_t err = hsa_executable_symbol_get_info( + rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, + &rpc_client_dev)) + handle_error(err); + + // Copy the address of the client buffer from the device to the host. + if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev, + dev_agent, sizeof(void *))) + handle_error(err); + + void *rpc_client_buffer; + if (hsa_status_t err = hsa_amd_memory_pool_allocate( + coarsegrained_pool, rpc_get_client_size(), + /*flags=*/0, &rpc_client_buffer)) + handle_error(err); + std::memcpy(rpc_client_buffer, rpc_get_client_buffer(device_id), + rpc_get_client_size()); + + // Copy the RPC client buffer to the address pointed to by the symbol. + if (hsa_status_t err = + hsa_memcpy(*reinterpret_cast(rpc_client_host), dev_agent, + rpc_client_buffer, host_agent, rpc_get_client_size())) + handle_error(err); + + if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_buffer)) + handle_error(err); + if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host)) + handle_error(err); + // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU. // If the clock_freq symbol is missing, no work to do. hsa_executable_symbol_t freq_sym; @@ -474,8 +517,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; - begin_args_t init_args = {argc, dev_argv, dev_envp, - rpc_get_buffer(device_id)}; + begin_args_t init_args = {argc, dev_argv, dev_envp}; if (hsa_status_t err = launch_kernel( dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, single_threaded_params, "_begin.kd", init_args)) diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp index 8b2132bc3c6e6..e920b65a7e10c 100644 --- a/libc/utils/gpu/loader/nvptx/Loader.cpp +++ b/libc/utils/gpu/loader/nvptx/Loader.cpp @@ -309,10 +309,25 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, warp_size, rpc_alloc, nullptr)) handle_error(err); + // Initialize the RPC client on the device by copying the local data to the + // device's internal pointer. + CUdeviceptr rpc_client_dev = 0; + uint64_t client_ptr_size = sizeof(void *); + if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size, + binary, rpc_client_symbol_name)) + handle_error(err); + + CUdeviceptr rpc_client_host = 0; + if (CUresult err = + cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *))) + handle_error(err); + if (CUresult err = + cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(device_id), + rpc_get_client_size())) + handle_error(err); + LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; - // Call the kernel to - begin_args_t init_args = {argc, dev_argv, dev_envp, - rpc_get_buffer(device_id)}; + begin_args_t init_args = {argc, dev_argv, dev_envp}; if (CUresult err = launch_kernel(binary, stream, single_threaded_params, "_begin", init_args)) handle_error(err); diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp index c98b9fa46ce05..ba58bf3cd6596 100644 --- a/libc/utils/gpu/server/rpc_server.cpp +++ b/libc/utils/gpu/server/rpc_server.cpp @@ -34,11 +34,6 @@ struct Server { Server(std::unique_ptr> &&server) : server(std::move(server)) {} - void reset(uint64_t port_count, void *buffer) { - std::visit([&](auto &server) { server->reset(port_count, buffer); }, - server); - } - uint64_t allocation_size(uint64_t port_count) { uint64_t ret = 0; std::visit([&](auto &server) { ret = server->allocation_size(port_count); }, @@ -46,12 +41,6 @@ struct Server { return ret; } - void *get_buffer_start() const { - void *ret = nullptr; - std::visit([&](auto &server) { ret = server->get_buffer_start(); }, server); - return ret; - } - rpc_status_t handle_server( std::unordered_map &callbacks, std::unordered_map &callback_data) { @@ -214,7 +203,9 @@ struct Server { struct Device { template - Device(std::unique_ptr &&server) : server(std::move(server)) {} + Device(uint32_t num_ports, void *buffer, std::unique_ptr &&server) + : buffer(buffer), server(std::move(server)), client(num_ports, buffer) {} + void *buffer; Server server; rpc::Client client; std::unordered_map callbacks; @@ -254,6 +245,24 @@ rpc_status_t rpc_shutdown(void) { return RPC_STATUS_SUCCESS; } +template +rpc_status_t server_init_impl(uint32_t device_id, uint64_t num_ports, + rpc_alloc_ty alloc, void *data) { + uint64_t size = rpc::Server::allocation_size(num_ports); + void *buffer = alloc(size, data); + + if (!buffer) + return RPC_STATUS_ERROR; + + state->devices[device_id] = std::make_unique( + num_ports, buffer, + std::make_unique>(num_ports, buffer)); + if (!state->devices[device_id]) + return RPC_STATUS_ERROR; + + return RPC_STATUS_SUCCESS; +} + rpc_status_t rpc_server_init(uint32_t device_id, uint64_t num_ports, uint32_t lane_size, rpc_alloc_ty alloc, void *data) { @@ -265,31 +274,26 @@ rpc_status_t rpc_server_init(uint32_t device_id, uint64_t num_ports, if (!state->devices[device_id]) { switch (lane_size) { case 1: - state->devices[device_id] = - std::make_unique(std::make_unique>()); + if (rpc_status_t err = + server_init_impl<1>(device_id, num_ports, alloc, data)) + return err; break; - case 32: - state->devices[device_id] = - std::make_unique(std::make_unique>()); + case 32: { + if (rpc_status_t err = + server_init_impl<32>(device_id, num_ports, alloc, data)) + return err; break; + } case 64: - state->devices[device_id] = - std::make_unique(std::make_unique>()); + if (rpc_status_t err = + server_init_impl<64>(device_id, num_ports, alloc, data)) + return err; break; default: return RPC_STATUS_INVALID_LANE_SIZE; } } - uint64_t size = state->devices[device_id]->server.allocation_size(num_ports); - void *buffer = alloc(size, data); - - if (!buffer) - return RPC_STATUS_ERROR; - - state->devices[device_id]->server.reset(num_ports, buffer); - state->devices[device_id]->client.reset(num_ports, buffer); - return RPC_STATUS_SUCCESS; } @@ -302,7 +306,7 @@ rpc_status_t rpc_server_shutdown(uint32_t device_id, rpc_free_ty dealloc, if (!state->devices[device_id]) return RPC_STATUS_ERROR; - dealloc(rpc_get_buffer(device_id), data); + dealloc(state->devices[device_id]->buffer, data); if (state->devices[device_id]) state->devices[device_id].release(); @@ -341,12 +345,6 @@ rpc_status_t rpc_register_callback(uint32_t device_id, rpc_opcode_t opcode, return RPC_STATUS_SUCCESS; } -void *rpc_get_buffer(uint32_t device_id) { - if (!state || device_id >= state->num_devices || !state->devices[device_id]) - return nullptr; - return state->devices[device_id]->server.get_buffer_start(); -} - const void *rpc_get_client_buffer(uint32_t device_id) { if (!state || device_id >= state->num_devices || !state->devices[device_id]) return nullptr; diff --git a/libc/utils/gpu/server/rpc_server.h b/libc/utils/gpu/server/rpc_server.h index f4f4f31265843..2fde0001185c1 100644 --- a/libc/utils/gpu/server/rpc_server.h +++ b/libc/utils/gpu/server/rpc_server.h @@ -87,11 +87,8 @@ rpc_status_t rpc_handle_server(uint32_t device_id); rpc_status_t rpc_register_callback(uint32_t device_id, rpc_opcode_t opcode, rpc_opcode_callback_ty callback, void *data); -/// Obtain a pointer to the memory buffer used to run the RPC client and server. -void *rpc_get_buffer(uint32_t device_id); - /// Obtain a pointer to a local client buffer that can be copied directly to the -/// other process. +/// other process using the address stored at the rpc client symbol name. const void *rpc_get_client_buffer(uint32_t device_id); /// Returns the size of the client in bytes to be used for a memory copy.