diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index ba3e41ce3e5a8..a50699c68f65b 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -117,7 +117,6 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.time.nanosleep
 
     # gpu/rpc.h entrypoints
-    libc.src.gpu.rpc_reset
     libc.src.gpu.rpc_host_call
 )
 
diff --git a/libc/spec/gpu_ext.td b/libc/spec/gpu_ext.td
index dca1e9f80f71e..dce81ff778620 100644
--- a/libc/spec/gpu_ext.td
+++ b/libc/spec/gpu_ext.td
@@ -5,11 +5,6 @@ def GPUExtensions : StandardSpec<"GPUExtensions"> {
     [], // Types
     [], // Enumerations
     [
-        FunctionSpec<
-            "rpc_reset",
-            RetValSpec<VoidType>,
-            [ArgSpec<UnsignedIntType>, ArgSpec<VoidPtr>]
-        >,
         FunctionSpec<
             "rpc_host_call",
             RetValSpec<VoidType>,
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index fc95e5edf1c72..80bcd37753d22 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -88,20 +88,13 @@ template <bool Invert, typename Packet> struct Process {
   static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
   cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
 
-  /// Initialize the communication channels.
-  LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
-    this->port_count = port_count;
-    this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
-        advance(buffer, inbox_offset(port_count)));
-    this->outbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
-        advance(buffer, outbox_offset(port_count)));
-    this->packet =
-        reinterpret_cast<Packet *>(advance(buffer, buffer_offset(port_count)));
-  }
-
-  /// Returns the beginning of the unified buffer. Intended for initializing the
-  /// client after the server has been started.
-  LIBC_INLINE void *get_buffer_start() const { return Invert ? outbox : inbox; }
+  LIBC_INLINE Process(uint32_t port_count, void *buffer)
+      : port_count(port_count), inbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
+                                    advance(buffer, inbox_offset(port_count)))),
+        outbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
+            advance(buffer, outbox_offset(port_count)))),
+        packet(reinterpret_cast<Packet *>(
+            advance(buffer, buffer_offset(port_count)))) {}
 
   /// Allocate a memory buffer sufficient to store the following equivalent
   /// representation in memory.
@@ -116,13 +109,13 @@ template <bool Invert, typename Packet> struct Process {
   }
 
   /// Retrieve the inbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) {
+  LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
     return gpu::broadcast_value(lane_mask,
                                 inbox[index].load(cpp::MemoryOrder::RELAXED));
   }
 
   /// Retrieve the outbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) {
+  LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
     return gpu::broadcast_value(lane_mask,
                                 outbox[index].load(cpp::MemoryOrder::RELAXED));
   }
@@ -349,13 +342,12 @@ struct Client {
   LIBC_INLINE Client &operator=(const Client &) = delete;
   LIBC_INLINE ~Client() = default;
 
+  LIBC_INLINE Client(uint32_t port_count, void *buffer)
+      : process(port_count, buffer) {}
+
   using Port = rpc::Port<false, Packet<gpu::LANE_SIZE>>;
   template <uint16_t opcode> LIBC_INLINE Port open();
 
-  LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
-    process.reset(port_count, buffer);
-  }
-
 private:
   Process<false, Packet<gpu::LANE_SIZE>> process;
 };
@@ -371,18 +363,13 @@ template <uint32_t lane_size> struct Server {
   LIBC_INLINE Server &operator=(const Server &) = delete;
   LIBC_INLINE ~Server() = default;
 
+  LIBC_INLINE Server(uint32_t port_count, void *buffer)
+      : process(port_count, buffer) {}
+
   using Port = rpc::Port<true, Packet<lane_size>>;
   LIBC_INLINE cpp::optional<Port> try_open();
   LIBC_INLINE Port open();
 
-  LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
-    process.reset(port_count, buffer);
-  }
-
-  LIBC_INLINE void *get_buffer_start() const {
-    return process.get_buffer_start();
-  }
-
   LIBC_INLINE static uint64_t allocation_size(uint32_t port_count) {
     return Process<true, Packet<lane_size>>::allocation_size(port_count);
   }
diff --git a/libc/src/gpu/CMakeLists.txt b/libc/src/gpu/CMakeLists.txt
index a0701c835bf46..e20228516b511 100644
--- a/libc/src/gpu/CMakeLists.txt
+++ b/libc/src/gpu/CMakeLists.txt
@@ -1,14 +1,3 @@
-add_entrypoint_object(
-  rpc_reset
-  SRCS
-    rpc_reset.cpp
-  HDRS
-    rpc_reset.h
-  DEPENDS
-    libc.src.__support.RPC.rpc_client
-    libc.src.__support.GPU.utils
-)
-
 add_entrypoint_object(
   rpc_host_call
   SRCS
diff --git a/libc/src/gpu/rpc_reset.cpp b/libc/src/gpu/rpc_reset.cpp
deleted file mode 100644
index ba5a097d1a1bc..0000000000000
--- a/libc/src/gpu/rpc_reset.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-//===---------- GPU implementation of the external RPC functionion --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/gpu/rpc_reset.h"
-
-#include "src/__support/GPU/utils.h"
-#include "src/__support/RPC/rpc_client.h"
-#include "src/__support/common.h"
-
-namespace __llvm_libc {
-
-// This is the external interface to initialize the RPC client with the
-// shared buffer.
-LLVM_LIBC_FUNCTION(void, rpc_reset,
-                   (unsigned int num_ports, void *rpc_shared_buffer)) {
-  __llvm_libc::rpc::client.reset(num_ports, rpc_shared_buffer);
-}
-
-} // namespace __llvm_libc
diff --git a/libc/src/gpu/rpc_reset.h b/libc/src/gpu/rpc_reset.h
deleted file mode 100644
index 5d6a6632760f8..0000000000000
--- a/libc/src/gpu/rpc_reset.h
+++ /dev/null
@@ -1,18 +0,0 @@
-//===-- Implementation header for RPC functions -----------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_GPU_RPC_H
-#define LLVM_LIBC_SRC_GPU_RPC_H
-
-namespace __llvm_libc {
-
-void rpc_reset(unsigned int num_ports, void *buffer);
-
-} // namespace __llvm_libc
-
-#endif // LLVM_LIBC_SRC_GPU_RPC_H
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index b2adb1d3abcaf..e6304ab243b8f 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -44,12 +44,7 @@ static void call_fini_array_callbacks() {
 } // namespace __llvm_libc
 
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
-_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
-  // We need to set up the RPC client first in case any of the constructors
-  // require it.
-  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
-                                 rpc_shared_buffer);
-
+_begin(int argc, char **argv, char **env) {
   // We want the fini array callbacks to be run after other atexit
   // callbacks are run. So, we register them before running the init
   // array callbacks as they can potentially register their own atexit
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index cd442394e74ce..d5d3ad2f15cac 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -42,12 +42,7 @@ static void call_fini_array_callbacks() {
 } // namespace __llvm_libc
 
 extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
-  // We need to set up the RPC client first in case any of the constructors
-  // require it.
-  __llvm_libc::rpc::client.reset(__llvm_libc::rpc::MAX_PORT_COUNT,
-                                 rpc_shared_buffer);
-
+_begin(int argc, char **argv, char **env) {
   // We want the fini array callbacks to be run after other atexit
   // callbacks are run. So, we register them before running the init
   // array callbacks as they can potentially register their own atexit
diff --git a/libc/test/src/__support/RPC/rpc_smoke_test.cpp b/libc/test/src/__support/RPC/rpc_smoke_test.cpp
index 587ca8eb111f2..b575e01bcb9ba 100644
--- a/libc/test/src/__support/RPC/rpc_smoke_test.cpp
+++ b/libc/test/src/__support/RPC/rpc_smoke_test.cpp
@@ -33,13 +33,8 @@ alignas(64) char buffer[alloc_size] = {0};
 
 TEST(LlvmLibcRPCSmoke, SanityCheck) {
 
-  ProcAType ProcA;
-  ProcBType ProcB;
-
-  ProcA.reset(port_count, buffer);
-  ProcB.reset(port_count, buffer);
-
-  EXPECT_EQ(ProcA.get_buffer_start(), ProcB.get_buffer_start());
+  ProcAType ProcA(port_count, buffer);
+  ProcBType ProcB(port_count, buffer);
 
   uint64_t index = 0; // any < port_count
   uint64_t lane_mask = 1;
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index 4eef88bf0463c..d2b2ee5baebed 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -34,7 +34,6 @@ struct begin_args_t {
   int argc;
   void *argv;
   void *envp;
-  void *rpc_shared_buffer;
 };
 
 /// The arguments to the '_start' kernel.
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index 9d732fe987da4..1d0247a6dc5dc 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -430,6 +430,49 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
   else
     handle_error("Invalid wavefront size");
 
+  // Initialize the RPC client on the device by copying the host's client image
+  // to the device address stored at the client symbol.
+  hsa_executable_symbol_t rpc_client_sym;
+  if (hsa_status_t err = hsa_executable_get_symbol_by_name(
+          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
+    handle_error(err);
+
+  void *rpc_client_host;
+  if (hsa_status_t err =
+          hsa_amd_memory_pool_allocate(coarsegrained_pool, sizeof(void *),
+                                       /*flags=*/0, &rpc_client_host))
+    handle_error(err);
+
+  void *rpc_client_dev;
+  if (hsa_status_t err = hsa_executable_symbol_get_info(
+          rpc_client_sym, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
+          &rpc_client_dev))
+    handle_error(err);
+
+  // Copy the pointer stored at the client symbol from the device to the host.
+  if (hsa_status_t err = hsa_memcpy(rpc_client_host, host_agent, rpc_client_dev,
+                                    dev_agent, sizeof(void *)))
+    handle_error(err);
+
+  void *rpc_client_buffer;
+  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+          coarsegrained_pool, rpc_get_client_size(),
+          /*flags=*/0, &rpc_client_buffer))
+    handle_error(err);
+  std::memcpy(rpc_client_buffer, rpc_get_client_buffer(device_id),
+              rpc_get_client_size());
+
+  // Copy the staged client image to the device address read from the symbol.
+  if (hsa_status_t err =
+          hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
+                     rpc_client_buffer, host_agent, rpc_get_client_size()))
+    handle_error(err);
+
+  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_buffer))
+    handle_error(err);
+  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
+    handle_error(err);
+
   // Obtain the GPU's fixed-frequency clock rate and copy it to the GPU.
   // If the clock_freq symbol is missing, no work to do.
   hsa_executable_symbol_t freq_sym;
@@ -474,8 +517,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
-  begin_args_t init_args = {argc, dev_argv, dev_envp,
-                            rpc_get_buffer(device_id)};
+  begin_args_t init_args = {argc, dev_argv, dev_envp};
   if (hsa_status_t err = launch_kernel(
           dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
           single_threaded_params, "_begin.kd", init_args))
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 8b2132bc3c6e6..e920b65a7e10c 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -309,10 +309,25 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
                                          warp_size, rpc_alloc, nullptr))
     handle_error(err);
 
+  // Initialize the RPC client on the device: read the pointer stored at the
+  // client symbol, then copy the host's client image to the address it holds.
+  CUdeviceptr rpc_client_dev = 0;
+  size_t client_ptr_size = sizeof(void *);
+  if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
+                                       binary, rpc_client_symbol_name))
+    handle_error(err);
+
+  CUdeviceptr rpc_client_host = 0;
+  if (CUresult err =
+          cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *)))
+    handle_error(err);
+  if (CUresult err =
+          cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(device_id),
+                       rpc_get_client_size()))
+    handle_error(err);
+
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
-  // Call the kernel to
-  begin_args_t init_args = {argc, dev_argv, dev_envp,
-                            rpc_get_buffer(device_id)};
+  begin_args_t init_args = {argc, dev_argv, dev_envp};
   if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
                                    "_begin", init_args))
     handle_error(err);
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index c98b9fa46ce05..ba58bf3cd6596 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -34,11 +34,6 @@ struct Server {
   Server(std::unique_ptr<rpc::Server<lane_size>> &&server)
       : server(std::move(server)) {}
 
-  void reset(uint64_t port_count, void *buffer) {
-    std::visit([&](auto &server) { server->reset(port_count, buffer); },
-               server);
-  }
-
   uint64_t allocation_size(uint64_t port_count) {
     uint64_t ret = 0;
     std::visit([&](auto &server) { ret = server->allocation_size(port_count); },
@@ -46,12 +41,6 @@ struct Server {
     return ret;
   }
 
-  void *get_buffer_start() const {
-    void *ret = nullptr;
-    std::visit([&](auto &server) { ret = server->get_buffer_start(); }, server);
-    return ret;
-  }
-
   rpc_status_t handle_server(
       std::unordered_map<rpc_opcode_t, rpc_opcode_callback_ty> &callbacks,
       std::unordered_map<rpc_opcode_t, void *> &callback_data) {
@@ -214,7 +203,9 @@ struct Server {
 
 struct Device {
   template <typename T>
-  Device(std::unique_ptr<T> &&server) : server(std::move(server)) {}
+  Device(uint32_t num_ports, void *buffer, std::unique_ptr<T> &&server)
+      : buffer(buffer), server(std::move(server)), client(num_ports, buffer) {}
+  void *buffer;
   Server server;
   rpc::Client client;
   std::unordered_map<rpc_opcode_t, rpc_opcode_callback_ty> callbacks;
@@ -254,6 +245,24 @@ rpc_status_t rpc_shutdown(void) {
   return RPC_STATUS_SUCCESS;
 }
 
+/// Allocates the shared buffer with 'alloc' and creates the device state (the
+/// server and its local client) for 'device_id' over that buffer.
+template <uint32_t lane_size>
+rpc_status_t server_init_impl(uint32_t device_id, uint64_t num_ports,
+                              rpc_alloc_ty alloc, void *data) {
+  uint64_t size = rpc::Server<lane_size>::allocation_size(num_ports);
+  void *buffer = alloc(size, data);
+
+  if (!buffer)
+    return RPC_STATUS_ERROR;
+
+  // 'std::make_unique' never yields null, so the result needs no check.
+  state->devices[device_id] = std::make_unique<Device>(
+      num_ports, buffer,
+      std::make_unique<rpc::Server<lane_size>>(num_ports, buffer));
+  return RPC_STATUS_SUCCESS;
+}
+
 rpc_status_t rpc_server_init(uint32_t device_id, uint64_t num_ports,
                              uint32_t lane_size, rpc_alloc_ty alloc,
                              void *data) {
@@ -265,31 +274,26 @@ rpc_status_t rpc_server_init(uint32_t device_id, uint64_t num_ports,
   if (!state->devices[device_id]) {
     switch (lane_size) {
     case 1:
-      state->devices[device_id] =
-          std::make_unique<Device>(std::make_unique<rpc::Server<1>>());
+      // 'server_init_impl' allocates the buffer and builds the device state.
+      if (rpc_status_t err =
+              server_init_impl<1>(device_id, num_ports, alloc, data))
+        return err;
       break;
     case 32:
-      state->devices[device_id] =
-          std::make_unique<Device>(std::make_unique<rpc::Server<32>>());
+      if (rpc_status_t err =
+              server_init_impl<32>(device_id, num_ports, alloc, data))
+        return err;
       break;
     case 64:
-      state->devices[device_id] =
-          std::make_unique<Device>(std::make_unique<rpc::Server<64>>());
+      if (rpc_status_t err =
+              server_init_impl<64>(device_id, num_ports, alloc, data))
+        return err;
       break;
     default:
       return RPC_STATUS_INVALID_LANE_SIZE;
     }
   }
 
-  uint64_t size = state->devices[device_id]->server.allocation_size(num_ports);
-  void *buffer = alloc(size, data);
-
-  if (!buffer)
-    return RPC_STATUS_ERROR;
-
-  state->devices[device_id]->server.reset(num_ports, buffer);
-  state->devices[device_id]->client.reset(num_ports, buffer);
-
   return RPC_STATUS_SUCCESS;
 }
 
@@ -302,7 +306,7 @@ rpc_status_t rpc_server_shutdown(uint32_t device_id, rpc_free_ty dealloc,
   if (!state->devices[device_id])
     return RPC_STATUS_ERROR;
 
-  dealloc(rpc_get_buffer(device_id), data);
-  if (state->devices[device_id])
-    state->devices[device_id].release();
+  // Free the shared buffer, then destroy the device; 'release()' would leak it.
+  dealloc(state->devices[device_id]->buffer, data);
+  state->devices[device_id].reset();
 
@@ -341,12 +345,6 @@ rpc_status_t rpc_register_callback(uint32_t device_id, rpc_opcode_t opcode,
   return RPC_STATUS_SUCCESS;
 }
 
-void *rpc_get_buffer(uint32_t device_id) {
-  if (!state || device_id >= state->num_devices || !state->devices[device_id])
-    return nullptr;
-  return state->devices[device_id]->server.get_buffer_start();
-}
-
 const void *rpc_get_client_buffer(uint32_t device_id) {
   if (!state || device_id >= state->num_devices || !state->devices[device_id])
     return nullptr;
diff --git a/libc/utils/gpu/server/rpc_server.h b/libc/utils/gpu/server/rpc_server.h
index f4f4f31265843..2fde0001185c1 100644
--- a/libc/utils/gpu/server/rpc_server.h
+++ b/libc/utils/gpu/server/rpc_server.h
@@ -87,11 +87,8 @@ rpc_status_t rpc_handle_server(uint32_t device_id);
 rpc_status_t rpc_register_callback(uint32_t device_id, rpc_opcode_t opcode,
                                    rpc_opcode_callback_ty callback, void *data);
 
-/// Obtain a pointer to the memory buffer used to run the RPC client and server.
-void *rpc_get_buffer(uint32_t device_id);
-
 /// Obtain a pointer to a local client buffer that can be copied directly to the
-/// other process.
+/// other process, to the address stored in the 'rpc_client_symbol_name' symbol.
 const void *rpc_get_client_buffer(uint32_t device_id);
 
 /// Returns the size of the client in bytes to be used for a memory copy.