EnzymeAD · Feb 21, 2025
diff --git a/‎deps/ReactantExtra/API.cpp
Lines changed: 131 additions & 95 deletions b/‎deps/ReactantExtra/API.cpp
Lines changed: 131 additions & 95 deletions
diff --git a/‎src/xla/Client.jl
Lines changed: 0 additions & 52 deletions b/‎src/xla/Client.jl
Lines changed: 0 additions & 52 deletions
diff --git a/‎src/xla/IFRT/Client.jl
Lines changed: 113 additions & 8 deletions b/‎src/xla/IFRT/Client.jl
Lines changed: 113 additions & 8 deletions
diff --git a/‎src/xla/PJRT/Client.jl
Lines changed: 76 additions & 5 deletions b/‎src/xla/PJRT/Client.jl
Lines changed: 76 additions & 5 deletions
diff --git a/‎src/xla/XLA.jl
Lines changed: 31 additions & 24 deletions b/‎src/xla/XLA.jl
Lines changed: 31 additions & 24 deletions
@@ -342,14 +342,12 @@ extern "C" void ProfilerServerStop(tsl::profiler::ProfilerServer *server) {
   delete server;
 }
 
-extern "C" PjRtClient *MakeCPUClient(uint8_t asynchronous, int node_id,
-                                     int num_nodes) {
+extern "C" PjRtClient *MakeCPUClient(uint8_t asynchronous, int node_id) {
   CpuClientOptions options;
-  // options.kv_store = "etcd";
+
   options.process_id = node_id;
-  // options.num_nodes = num_nodes;
-  // options.collectives = num_nodes;
   options.asynchronous = asynchronous != 0;
+
   auto client = MyValueOrThrow(GetTfrtCpuClient(options));
   return client.release();
 }
@@ -1271,28 +1269,6 @@ extern "C" MlirOperation LinkInModule(MlirModule prevModC, MlirModule newModC,
   return wrap(entryFn);
 }
 
-extern "C" HeldPjRtClient *
-pjrt_make_cpu_client_shared(uint8_t asynchronous, int node_id, int num_nodes) {
-  PjRtClient *client = MakeCPUClient(asynchronous, node_id, num_nodes);
-  return reactant::capture(std::shared_ptr<PjRtClient>(client));
-}
-
-extern "C" HeldPjRtClient *pjrt_make_gpu_client_shared(
-    int node_id, int num_nodes, int *allowed_devices, int num_allowed_devices,
-    double memory_fraction, bool preallocate, const char *platform_name,
-    const char **error, void *distributed_runtime_client) {
-  PjRtClient *client = MakeGPUClient(
-      node_id, num_nodes, allowed_devices, num_allowed_devices, memory_fraction,
-      preallocate, platform_name, error, distributed_runtime_client);
-  return reactant::capture(std::shared_ptr<PjRtClient>(client));
-}
-
-extern "C" HeldPjRtClient *pjrt_make_tpu_client_shared(const char *tpu_path,
-                                                       const char **error) {
-  PjRtClient *client = MakeTPUClient(tpu_path, error);
-  return reactant::capture(std::shared_ptr<PjRtClient>(client));
-}
-
 extern "C" void pjrt_client_dtor(HeldPjRtClient *client) { delete client; }
 
 extern "C" int pjrt_client_num_devices(HeldPjRtClient *client) {
@@ -1369,11 +1345,6 @@ extern "C" HeldPjRtClient *pjrt_buffer_get_client(HeldPjRtBuffer *buffer) {
       std::shared_ptr<PjRtClient>(buffer->ptr()->client()));
 }
 
-extern "C" ifrt::Client *ifrt_pjrt_make_client(HeldPjRtClient *pjrt_client) {
-  xla::ifrt::PjRtClient::CreateOptions options = {pjrt_client->obj()};
-  return MyValueOrThrow(xla::ifrt::PjRtClient::Create(options)).release();
-}
-
 extern "C" void ifrt_client_dtor(ifrt::Client *client) { delete client; }
 
 // generic version, but IFRT-PjRt backend only supports SingleDeviceSharding
@@ -1575,61 +1546,62 @@ FreeHloModule(HeldValue<std::shared_ptr<xla::HloModule>> *hlo_module) {
 
 #pragma region IfRtClient
 
-extern "C" ifrt::proxy::GrpcServer *
-ifrt_proxy_grpc_server_create_from_ifrt_client_factory_cpu(
-    const char *c_address, uint8_t asynchronous, int node_id, int num_nodes) {
-  std::string address = c_address;
-
-  return MyValueOrThrow(
-             ifrt::proxy::GrpcServer::CreateFromIfrtClientFactory(
-                 address,
-                 [asynchronous, node_id, num_nodes]()
-                     -> absl::StatusOr<std::shared_ptr<ifrt::Client>> {
-                   auto pjrt_client = std::shared_ptr<PjRtClient>(
-                       MakeCPUClient(asynchronous, node_id, num_nodes));
-                   return std::shared_ptr<ifrt::Client>(
-                       xla::ifrt::PjRtClient::Create(pjrt_client).release());
-                 }))
-      .release();
-}
-
-extern "C" ifrt::proxy::GrpcServer *
-ifrt_proxy_grpc_server_create_from_ifrt_client_factory_gpu(
-    int node_id, int num_nodes, int *allowed_devices, int num_allowed_devices,
-    double memory_fraction, bool preallocate, const char *platform_name,
-    const char **error) {
-  return MyValueOrThrow(
-             ifrt::proxy::GrpcServer::CreateFromIfrtClientFactory(
-                 std::string(),
-                 [node_id, num_nodes, allowed_devices, num_allowed_devices,
-                  memory_fraction, preallocate, platform_name,
-                  error]() -> absl::StatusOr<std::shared_ptr<ifrt::Client>> {
-                   auto pjrt_client = std::shared_ptr<PjRtClient>(MakeGPUClient(
-                       node_id, num_nodes, allowed_devices, num_allowed_devices,
-                       memory_fraction, preallocate, platform_name, error));
-                   return std::shared_ptr<ifrt::Client>(
-                       xla::ifrt::PjRtClient::Create(pjrt_client).release());
-                 }))
-      .release();
-}
+// XXX: Bring back with the correct API
+// extern "C" ifrt::proxy::GrpcServer *
+// ifrt_proxy_grpc_server_create_from_ifrt_client_factory_cpu(
+//     const char *c_address, uint8_t asynchronous, int node_id) {
+//   std::string address = c_address;
+
+//   return MyValueOrThrow(
+//              ifrt::proxy::GrpcServer::CreateFromIfrtClientFactory(
+//                  address,
+//                  [asynchronous,
+//                   node_id]() -> absl::StatusOr<std::shared_ptr<ifrt::Client>> {
+//                    auto pjrt_client = std::shared_ptr<PjRtClient>(
+//                        MakeCPUClient(asynchronous, node_id));
+//                    return std::shared_ptr<ifrt::Client>(
+//                        xla::ifrt::PjRtClient::Create(pjrt_client).release());
+//                  }))
+//       .release();
+// }
 
-extern "C" ifrt::proxy::GrpcServer *
-ifrt_proxy_grpc_server_create_from_ifrt_client_factory_tpu(
-    const char *c_address, const char *tpu_path, const char **error) {
-  std::string address = c_address;
+// extern "C" ifrt::proxy::GrpcServer *
+// ifrt_proxy_grpc_server_create_from_ifrt_client_factory_gpu(
+//     int node_id, int num_nodes, int *allowed_devices, int num_allowed_devices,
+//     double memory_fraction, bool preallocate, const char *platform_name,
+//     const char **error) {
+//   return MyValueOrThrow(
+//              ifrt::proxy::GrpcServer::CreateFromIfrtClientFactory(
+//                  std::string(),
+//                  [node_id, num_nodes, allowed_devices, num_allowed_devices,
+//                   memory_fraction, preallocate, platform_name,
+//                   error]() -> absl::StatusOr<std::shared_ptr<ifrt::Client>> {
+//                    auto pjrt_client = std::shared_ptr<PjRtClient>(MakeGPUClient(
+//                        node_id, num_nodes, allowed_devices, num_allowed_devices,
+//                        memory_fraction, preallocate, platform_name, error));
+//                    return std::shared_ptr<ifrt::Client>(
+//                        xla::ifrt::PjRtClient::Create(pjrt_client).release());
+//                  }))
+//       .release();
+// }
 
-  return MyValueOrThrow(
-             xla::ifrt::proxy::GrpcServer::CreateFromIfrtClientFactory(
-                 address,
-                 [tpu_path, error]()
-                     -> absl::StatusOr<std::shared_ptr<xla::ifrt::Client>> {
-                   auto pjrt_client = std::shared_ptr<xla::PjRtClient>(
-                       MakeTPUClient(tpu_path, error));
-                   return std::shared_ptr<xla::ifrt::Client>(
-                       xla::ifrt::PjRtClient::Create(pjrt_client).release());
-                 }))
-      .release();
-}
+// extern "C" ifrt::proxy::GrpcServer *
+// ifrt_proxy_grpc_server_create_from_ifrt_client_factory_tpu(
+//     const char *c_address, const char *tpu_path, const char **error) {
+//   std::string address = c_address;
+
+//   return MyValueOrThrow(
+//              xla::ifrt::proxy::GrpcServer::CreateFromIfrtClientFactory(
+//                  address,
+//                  [tpu_path, error]()
+//                      -> absl::StatusOr<std::shared_ptr<xla::ifrt::Client>> {
+//                    auto pjrt_client = std::shared_ptr<xla::PjRtClient>(
+//                        MakeTPUClient(tpu_path, error));
+//                    return std::shared_ptr<xla::ifrt::Client>(
+//                        xla::ifrt::PjRtClient::Create(pjrt_client).release());
+//                  }))
+//       .release();
+// }
 
 extern "C" void ifrt_proxy_grpc_server_dtor(ifrt::proxy::GrpcServer *server) {
   delete server;
@@ -1662,24 +1634,88 @@ ifrt_proxy_create_client(const char *c_proxy_server_address,
       .release();
 }
 
-extern "C" ifrt::Client *ifrt_make_pjrt_cpu_client(uint8_t asynchronous,
-                                                   int node_id, int num_nodes) {
-  return ifrt_pjrt_make_client(
-      pjrt_make_cpu_client_shared(asynchronous, node_id, num_nodes));
+// Builds an IFRT client on top of an existing PJRT client.
+//
+// The PJRT client held by `pjrt_client` is stored in the creation options
+// together with `node_id` (process id) and `num_nodes` (process count). When
+// `num_nodes` > 1, `distributed_runtime_client` is cast to a
+// `HeldValue<std::shared_ptr<xla::DistributedRuntimeClient>> *` and used with
+// `key_prefix` to obtain the key-value store; if it is null in that case,
+// `*error` is set and nullptr is returned. The result of
+// `xla::ifrt::PjRtClient::Create` is unwrapped with `MyValueOrThrow`.
+extern "C" ifrt::Client *ifrt_pjrt_make_client(HeldPjRtClient *pjrt_client,
+                                               int node_id, int num_nodes,
+                                               void *distributed_runtime_client,
+                                               const char **error,
+                                               std::string key_prefix) {
+  using HeldRuntimeClient =
+      HeldValue<std::shared_ptr<xla::DistributedRuntimeClient>>;
+
+  ifrt::PjRtClient::CreateOptions create_options;
+  create_options.pjrt_client = pjrt_client->obj();
+  create_options.process_id = node_id;
+  create_options.num_processes = num_nodes;
+
+  const bool multi_process = num_nodes > 1;
+  if (multi_process && distributed_runtime_client == nullptr) {
+    *error =
+        "`distributed_runtime_client` must be non-null if `num_nodes` > 1";
+    return nullptr;
+  }
+  if (multi_process) {
+    // Only the multi-process configuration needs a key-value store.
+    auto *held_runtime =
+        static_cast<HeldRuntimeClient *>(distributed_runtime_client);
+    create_options.kv_store =
+        GetDistributedKeyValueStore(held_runtime->obj(), key_prefix);
+  }
+
+  return MyValueOrThrow(xla::ifrt::PjRtClient::Create(create_options))
+      .release();
+}
+
+// Creates a CPU PJRT client with `MakeCPUClient` and returns it as a
+// `std::shared_ptr<PjRtClient>` inside the wrapper produced by
+// `reactant::capture`.
+extern "C" HeldPjRtClient *pjrt_make_cpu_client_shared(uint8_t asynchronous,
+                                                       int node_id) {
+  return reactant::capture(
+      std::shared_ptr<PjRtClient>(MakeCPUClient(asynchronous, node_id)));
+}
+
+// Creates a CPU PJRT client and wraps it in an IFRT client.
+//
+// `node_id`, `num_nodes`, `distributed_runtime_client` and `error` are
+// forwarded to `ifrt_pjrt_make_client` with the key prefix "cpu". Returns
+// nullptr when the PJRT wrapper could not be created or when
+// `ifrt_pjrt_make_client` reports an error through `*error`.
+extern "C" ifrt::Client *
+ifrt_make_pjrt_cpu_client(uint8_t asynchronous, int node_id, int num_nodes,
+                          void *distributed_runtime_client,
+                          const char **error) {
+  // The wrapper is only needed to hand the shared PJRT client to
+  // `ifrt_pjrt_make_client`, which stores `obj()` in its own options. Owning
+  // it here releases it on every exit path (success, nullptr error return,
+  // exception) instead of leaking it and pinning one extra reference to the
+  // PJRT client.
+  // NOTE(review): assumes `HeldPjRtClient` holds the client by
+  // `std::shared_ptr`, so deleting the wrapper does not destroy a client that
+  // the IFRT client still uses -- confirm against `reactant::capture`.
+  std::unique_ptr<HeldPjRtClient> pjrt_client(
+      pjrt_make_cpu_client_shared(asynchronous, node_id));
+  if (pjrt_client == nullptr)
+    return nullptr;
+  return ifrt_pjrt_make_client(pjrt_client.get(), node_id, num_nodes,
+                               distributed_runtime_client, error, "cpu");
+}
+
+// Creates a GPU PJRT client with `MakeGPUClient` and returns it inside the
+// shared-ownership wrapper produced by `reactant::capture`.
+//
+// All arguments are forwarded unchanged to `MakeGPUClient`. Returns nullptr
+// when `MakeGPUClient` returns nullptr; `*error` is left exactly as
+// `MakeGPUClient` set it.
+extern "C" HeldPjRtClient *pjrt_make_gpu_client_shared(
+    int node_id, int num_nodes, int *allowed_devices, int num_allowed_devices,
+    double memory_fraction, bool preallocate, const char *platform_name,
+    const char **error, void *distributed_runtime_client) {
+  PjRtClient *client = MakeGPUClient(
+      node_id, num_nodes, allowed_devices, num_allowed_devices, memory_fraction,
+      preallocate, platform_name, error, distributed_runtime_client);
+  // Propagate failure as nullptr rather than wrapping a null client, so the
+  // `pjrt_client == nullptr` checks in callers actually observe the failure.
+  if (client == nullptr)
+    return nullptr;
+  return reactant::capture(std::shared_ptr<PjRtClient>(client));
 }
 
 extern "C" ifrt::Client *ifrt_make_pjrt_gpu_client(
     int node_id, int num_nodes, int *allowed_devices, int num_allowed_devices,
     double memory_fraction, bool preallocate, const char *platform_name,
     const char **error, void *distributed_runtime_client) {
-  return ifrt_pjrt_make_client(pjrt_make_gpu_client_shared(
+  HeldPjRtClient *pjrt_client = pjrt_make_gpu_client_shared(
       node_id, num_nodes, allowed_devices, num_allowed_devices, memory_fraction,
-      preallocate, platform_name, error, distributed_runtime_client));
+      preallocate, platform_name, error, distributed_runtime_client);
+  if (pjrt_client == nullptr)
+    return nullptr;
+  return ifrt_pjrt_make_client(pjrt_client, node_id, num_nodes,
+                               distributed_runtime_client, error, "gpu");
+}
+
+// Creates a TPU PJRT client with `MakeTPUClient` and returns it inside the
+// shared-ownership wrapper produced by `reactant::capture`.
+//
+// Returns nullptr when `MakeTPUClient` returns nullptr; `*error` is left
+// exactly as `MakeTPUClient` set it.
+extern "C" HeldPjRtClient *pjrt_make_tpu_client_shared(const char *tpu_path,
+                                                       const char **error) {
+  PjRtClient *client = MakeTPUClient(tpu_path, error);
+  // Propagate failure as nullptr rather than wrapping a null client, so the
+  // `pjrt_client == nullptr` checks in callers actually observe the failure.
+  if (client == nullptr)
+    return nullptr;
+  return reactant::capture(std::shared_ptr<PjRtClient>(client));
 }
 
-extern "C" ifrt::Client *ifrt_make_pjrt_tpu_client(const char *tpu_path,
-                                                   const char **error) {
-  return ifrt_pjrt_make_client(pjrt_make_tpu_client_shared(tpu_path, error));
+extern "C" ifrt::Client *
+ifrt_make_pjrt_tpu_client(const char *tpu_path, const char **error, int node_id,
+                          int num_nodes, void *distributed_runtime_client) {
+  HeldPjRtClient *pjrt_client = pjrt_make_tpu_client_shared(tpu_path, error);
+  if (pjrt_client == nullptr)
+    return nullptr;
+  return ifrt_pjrt_make_client(pjrt_client, node_id, num_nodes,
+                               distributed_runtime_client, error, "tpu");
 }
 
 extern "C" void ifrt_FreeClient(ifrt::Client *client) { delete client; }
@@ -1943,7 +1979,7 @@ extern "C" void ifrt_array_copy_to_host_buffer(HeldIfrtArray *array,
 
 #pragma endregion
 
-#pragma region PjRtDistributed
+#pragma region xla::Distributed
 
 extern "C" HeldValue<std::shared_ptr<xla::DistributedRuntimeClient>> *
 GetDistributedRuntimeClient(char *c_address, int32_t node_id,
 
@@ -14,55 +14,3 @@ function get_addressable_device end
 function platform_name end
 
 default_device(client::AbstractClient) = first(addressable_devices(client))
-
-# Clients for Different Backends
-function CPUClient(cfunc, node_id=0, num_nodes=1; asynchronous=true)
-    f = Libdl.dlsym(Reactant_jll.libReactantExtra_handle, string(cfunc))
-    client = ccall(f, Ptr{Cvoid}, (UInt, Cint, Cint), asynchronous, node_id, num_nodes)
-    LLVMclopts("-nvptx-fma-level=1")
-    return client
-end
-
-function GPUClient(
-    cfunc,
-    node_id=0,
-    num_nodes=1,
-    platform="gpu";
-    allowed_devices::Union{Nothing,Vector{Int}}=nothing,
-    distributed_runtime_client::Union{Nothing,DistributedRuntimeClient}=nothing,
-)
-    f = Libdl.dlsym(Reactant_jll.libReactantExtra_handle, string(cfunc))
-    refstr = Ref{Cstring}()
-
-    num_allowed_devices = allowed_devices === nothing ? 0 : length(allowed_devices)
-    allowed_devices = allowed_devices === nothing ? C_NULL : allowed_devices
-    distributed_runtime_client =
-        distributed_runtime_client === nothing ? C_NULL : distributed_runtime_client.client
-
-    client = ccall(
-        f,
-        Ptr{Cvoid},
-        (Cint, Cint, Ptr{Cvoid}, Cint, Cdouble, Bool, Cstring, Ptr{Cstring}, Ptr{Cvoid}),
-        node_id,
-        num_nodes,
-        allowed_devices,
-        num_allowed_devices,
-        XLA_REACTANT_GPU_MEM_FRACTION[],
-        false,
-        platform,
-        refstr,
-        distributed_runtime_client,
-    )
-    client == C_NULL && throw(AssertionError(unsafe_string(refstr[])))
-    LLVMclopts("-nvptx-fma-level=1")
-    return client
-end
-
-function TPUClient(cfunc, tpu_path::String)
-    f = Libdl.dlsym(Reactant_jll.libReactantExtra_handle, string(cfunc))
-    refstr = Ref{Cstring}()
-    client = ccall(f, Ptr{Cvoid}, (Cstring, Ptr{Cstring}), tpu_path, refstr)
-    client == C_NULL && throw(AssertionError(unsafe_string(refstr[])))
-    LLVMclopts("-nvptx-fma-level=1")
-    return client
-end
@@ -1,8 +1,8 @@
 mutable struct Client <: XLA.AbstractClient
     client::Ptr{Cvoid}
 
-    function Client(client::Ptr{Cvoid})
-        @assert client != C_NULL
+    function Client(client::Ptr{Cvoid}; skip_check::Bool=false)
+        skip_check || (@assert client != C_NULL)
         return new(client)
     end
 end
@@ -66,22 +66,127 @@ function XLA.platform_name(client::Client)
     return XLA.unsafe_string_and_free(str)
 end
 
+# Returns a `Device` for every device handle reported by
+# `ifrt_client_devices`.
+#
+# The output buffer is a `Vector` sized from `XLA.num_devices(client)`, so the
+# buffer type does not depend on the runtime device count.
+function XLA.devices(client::Client)
+    ndevices = Int(XLA.num_devices(client))
+    device_ptrs = Vector{Ptr{Cvoid}}(undef, ndevices)
+    GC.@preserve client device_ptrs begin
+        @ccall MLIR.API.mlir_c.ifrt_client_devices(
+            client.client::Ptr{Cvoid}, device_ptrs::Ptr{Ptr{Cvoid}}
+        )::Cvoid
+    end
+    return [Device(device) for device in device_ptrs]
+end
+
+# Returns a `Device` for every device handle reported by
+# `ifrt_client_addressable_devices`.
+#
+# The output buffer is a `Vector` sized from
+# `XLA.num_addressable_devices(client)`, so the buffer type does not depend on
+# the runtime device count.
+function XLA.addressable_devices(client::Client)
+    naddressable_devices = Int(XLA.num_addressable_devices(client))
+    device_ptrs = Vector{Ptr{Cvoid}}(undef, naddressable_devices)
+    GC.@preserve client device_ptrs begin
+        @ccall MLIR.API.mlir_c.ifrt_client_addressable_devices(
+            client.client::Ptr{Cvoid}, device_ptrs::Ptr{Ptr{Cvoid}}
+        )::Cvoid
+    end
+    return [Device(device) for device in device_ptrs]
+end
+
 # Different Backends
 const cpu_client_count = Ref(0)
 const gpu_client_count = Ref(0)
 const tpu_client_count = Ref(0)
 
-# XXX: We need other backends to support sharding
-for (backend, fname, counter) in (
-    (:CPUClient, "ifrt_make_pjrt_cpu_client", :cpu_client_count),
-    (:GPUClient, "ifrt_make_pjrt_gpu_client", :gpu_client_count),
-    (:TPUClient, "ifrt_make_pjrt_tpu_client", :tpu_client_count),
+for (backend, counter) in (
+    (:CPUClient, :cpu_client_count),
+    (:GPUClient, :gpu_client_count),
+    (:TPUClient, :tpu_client_count),
 )
+    main_fn = Symbol(:MakeIFRTPJRT, backend)
     @eval function $(backend)(args...; checkcount::Bool=true, kwargs...)
         if checkcount
             @assert $(counter)[] == 0
+        end
+        client, refstr = $(main_fn)(args...; kwargs...)
+        client == C_NULL && throw(AssertionError(unsafe_string(refstr[])))
+        XLA.LLVMclopts("-nvptx-fma-level=1")
+        if checkcount
+            # Only increment the counter if we successfully created a client
             $(counter)[] += 1
         end
-        return Client(XLA.$(backend)($(fname), args...; kwargs...))
+        return Client(client)
     end
 end
+
+# Calls `ifrt_make_pjrt_cpu_client` and returns `(client_pointer, error_ref)`.
+#
+# The raw pointer is not checked here; the caller inspects it and, when it is
+# `C_NULL`, reads the message from the returned `Ref{Cstring}`.
+function MakeIFRTPJRTCPUClient(;
+    node_id::Integer=0,
+    num_nodes::Integer=1,
+    asynchronous::Bool=true,
+    distributed_runtime_client::Union{Nothing,XLA.DistributedRuntimeClient}=nothing,
+)
+    error_ref = Ref{Cstring}()
+    runtime_ptr = if distributed_runtime_client === nothing
+        C_NULL
+    else
+        distributed_runtime_client.client
+    end
+
+    GC.@preserve error_ref runtime_ptr begin
+        handle = @ccall MLIR.API.mlir_c.ifrt_make_pjrt_cpu_client(
+            asynchronous::UInt8,
+            node_id::Cint,
+            num_nodes::Cint,
+            runtime_ptr::Ptr{Cvoid},
+            error_ref::Ptr{Cstring},
+        )::Ptr{Cvoid}
+    end
+
+    return handle, error_ref
+end
+
+# Calls `ifrt_make_pjrt_gpu_client` and returns `(client_pointer, error_ref)`.
+#
+# The raw pointer is not checked here; the caller inspects it and, when it is
+# `C_NULL`, reads the message from the returned `Ref{Cstring}`.
+# `allowed_devices === nothing` is passed to C as a null pointer with a count
+# of 0.
+function MakeIFRTPJRTGPUClient(;
+    node_id::Integer=0,
+    num_nodes::Integer=1,
+    platform::String="gpu",
+    allowed_devices::Union{Nothing,Vector{Int}}=nothing,
+    distributed_runtime_client::Union{Nothing,XLA.DistributedRuntimeClient}=nothing,
+)
+    refstr = Ref{Cstring}()
+
+    # The C entry point declares `int *allowed_devices`, so the ids are copied into
+    # a `Cint` buffer. Passing the `Int` storage directly would be read with the
+    # wrong element width wherever `Int` is wider than `Cint`.
+    num_allowed_devices = allowed_devices === nothing ? 0 : length(allowed_devices)
+    allowed_device_ids =
+        allowed_devices === nothing ? C_NULL : convert(Vector{Cint}, allowed_devices)
+    runtime_client_ptr =
+        distributed_runtime_client === nothing ? C_NULL : distributed_runtime_client.client
+
+    # Root the owning objects (not just the raw pointer) for the duration of the call.
+    GC.@preserve refstr allowed_device_ids distributed_runtime_client begin
+        client = @ccall MLIR.API.mlir_c.ifrt_make_pjrt_gpu_client(
+            node_id::Cint,
+            num_nodes::Cint,
+            allowed_device_ids::Ptr{Cint},
+            num_allowed_devices::Cint,
+            XLA.XLA_REACTANT_GPU_MEM_FRACTION[]::Cdouble,
+            XLA.XLA_REACTANT_GPU_PREALLOCATE[]::Bool,
+            platform::Cstring,
+            refstr::Ptr{Cstring},
+            runtime_client_ptr::Ptr{Cvoid},
+        )::Ptr{Cvoid}
+    end
+
+    return client, refstr
+end
+
+# Calls `ifrt_make_pjrt_tpu_client` and returns `(client_pointer, error_ref)`.
+#
+# The raw pointer is not checked here; the caller inspects it and, when it is
+# `C_NULL`, reads the message from the returned `Ref{Cstring}`.
+function MakeIFRTPJRTTPUClient(;
+    tpu_path::String,
+    node_id::Integer=0,
+    num_nodes::Integer=1,
+    distributed_runtime_client::Union{Nothing,XLA.DistributedRuntimeClient}=nothing,
+)
+    error_ref = Ref{Cstring}()
+    runtime_ptr = if distributed_runtime_client === nothing
+        C_NULL
+    else
+        distributed_runtime_client.client
+    end
+
+    GC.@preserve error_ref runtime_ptr begin
+        handle = @ccall MLIR.API.mlir_c.ifrt_make_pjrt_tpu_client(
+            tpu_path::Cstring,
+            error_ref::Ptr{Cstring},
+            node_id::Cint,
+            num_nodes::Cint,
+            runtime_ptr::Ptr{Cvoid},
+        )::Ptr{Cvoid}
+    end
+
+    return handle, error_ref
+end
@@ -89,20 +89,91 @@ const cpu_client_count = Ref(0)
 const gpu_client_count = Ref(0)
 const tpu_client_count = Ref(0)
 
-for (backend, fname, counter) in (
-    (:CPUClient, "MakeCPUClient", :cpu_client_count),
-    (:GPUClient, "MakeGPUClient", :gpu_client_count),
-    (:TPUClient, "MakeTPUClient", :tpu_client_count),
+for (backend, counter) in (
+    (:CPUClient, :cpu_client_count),
+    (:GPUClient, :gpu_client_count),
+    (:TPUClient, :tpu_client_count),
 )
+    main_fn = Symbol(:Make, backend)
     @eval function $(backend)(args...; checkcount::Bool=true, kwargs...)
         if checkcount
             @assert $(counter)[] == 0
         end
-        client = Client(XLA.$(backend)($(fname), args...; kwargs...))
+        client = Client($(main_fn)(args...; kwargs...))
+        XLA.LLVMclopts("-nvptx-fma-level=1")
         if checkcount
             # Only increment the counter if we successfully created a client
             $(counter)[] += 1
         end
         return client
     end
 end
+
+# Calls the C `MakeCPUClient` entry point and returns the raw client pointer.
+#
+# `num_nodes` and `distributed_runtime_client` are accepted so the keyword set
+# matches the other backends, but only `num_nodes == 1` and `nothing` pass the
+# assertions below.
+function MakeCPUClient(;
+    node_id::Integer=0,
+    num_nodes::Integer=1,
+    asynchronous::Bool=true,
+    distributed_runtime_client::Union{Nothing,XLA.DistributedRuntimeClient}=nothing,
+)
+    @assert num_nodes == 1 "`PJRT.MakeCPUClient` does not support num_nodes > 1"
+    @assert distributed_runtime_client === nothing "`PJRT.MakeCPUClient` does not support \
+                                                    distributed_runtime_client"
+
+    handle = @ccall MLIR.API.mlir_c.MakeCPUClient(
+        asynchronous::UInt8, node_id::Cint
+    )::Ptr{Cvoid}
+    return handle
+end
+
+# Calls the C `MakeGPUClient` entry point and returns the raw client pointer.
+#
+# Throws an `AssertionError` carrying the message written by the C side when
+# the returned pointer is `C_NULL`. `allowed_devices === nothing` is passed to
+# C as a null pointer with a count of 0.
+function MakeGPUClient(;
+    node_id::Integer=0,
+    num_nodes::Integer=1,
+    platform::String="gpu",
+    allowed_devices::Union{Nothing,Vector{Int}}=nothing,
+    distributed_runtime_client::Union{Nothing,XLA.DistributedRuntimeClient}=nothing,
+)
+    refstr = Ref{Cstring}()
+
+    # The C side receives the ids as `int *`, so they are copied into a `Cint`
+    # buffer. Passing the `Int` storage directly would be read with the wrong
+    # element width wherever `Int` is wider than `Cint`.
+    num_allowed_devices = allowed_devices === nothing ? 0 : length(allowed_devices)
+    allowed_device_ids =
+        allowed_devices === nothing ? C_NULL : convert(Vector{Cint}, allowed_devices)
+    runtime_client_ptr =
+        distributed_runtime_client === nothing ? C_NULL : distributed_runtime_client.client
+
+    # Root the owning objects (not just the raw pointer) for the duration of the call.
+    GC.@preserve refstr allowed_device_ids distributed_runtime_client begin
+        client = @ccall MLIR.API.mlir_c.MakeGPUClient(
+            node_id::Cint,
+            num_nodes::Cint,
+            allowed_device_ids::Ptr{Cint},
+            num_allowed_devices::Cint,
+            XLA.XLA_REACTANT_GPU_MEM_FRACTION[]::Cdouble,
+            XLA.XLA_REACTANT_GPU_PREALLOCATE[]::Bool,
+            platform::Cstring,
+            refstr::Ptr{Cstring},
+            runtime_client_ptr::Ptr{Cvoid},
+        )::Ptr{Cvoid}
+    end
+
+    client == C_NULL && throw(AssertionError(unsafe_string(refstr[])))
+    return client
+end
+
+# Calls the C `MakeTPUClient` entry point and returns the raw client pointer.
+#
+# `node_id`, `num_nodes` and `distributed_runtime_client` are accepted so the
+# keyword set matches the other backends, but only their defaults pass the
+# assertions below. Throws an `AssertionError` carrying the message written by
+# the C side when the returned pointer is `C_NULL`.
+function MakeTPUClient(;
+    tpu_path::String,
+    node_id::Integer=0,
+    num_nodes::Integer=1,
+    distributed_runtime_client::Union{Nothing,XLA.DistributedRuntimeClient}=nothing,
+)
+    @assert node_id == 0 "`PJRT.MakeTPUClient` does not support node_id"
+    @assert num_nodes == 1 "`PJRT.MakeTPUClient` does not support num_nodes > 1"
+    @assert distributed_runtime_client === nothing "`PJRT.MakeTPUClient` does not support \
+                                                    distributed_runtime_client"
+
+    error_ref = Ref{Cstring}()
+    GC.@preserve error_ref begin
+        handle = @ccall MLIR.API.mlir_c.MakeTPUClient(
+            tpu_path::Cstring, error_ref::Ptr{Cstring}
+        )::Ptr{Cvoid}
+    end
+
+    if handle == C_NULL
+        throw(AssertionError(unsafe_string(error_ref[])))
+    end
+    return handle
+end
@@ -37,25 +37,25 @@ include("PJRT/PJRT.jl")
 
 include("IFRT/IFRT.jl")
 
-@kwdef mutable struct BackendState
+@kwdef mutable struct PJRTBackendState
     initialized::Bool = false
     clients::Dict{String,PJRT.Client} = Dict{String,PJRT.Client}()
     default_client::PJRT.Client = PJRT.Client(C_NULL; skip_check=true)
 end
 
-function Base.getproperty(bs::BackendState, sym::Symbol)
+function Base.getproperty(bs::PJRTBackendState, sym::Symbol)
     (sym === :initialized || bs.initialized) && return getfield(bs, sym)
-    initialize_default_clients!(bs)
+    initialize_default_pjrt_clients!(bs)
     return getfield(bs, sym)
 end
 
-function Base.setproperty!(bs::BackendState, sym::Symbol, val)
+function Base.setproperty!(bs::PJRTBackendState, sym::Symbol, val)
     (sym === :initialized || bs.initialized) && return setfield!(bs, sym, val)
-    initialize_default_clients!(bs)
+    initialize_default_pjrt_clients!(bs)
     return setfield!(bs, sym, val)
 end
 
-const global_backend_state = BackendState()
+const global_backend_state = PJRTBackendState()
 const global_state = State()
 
 client(backend::String) = global_backend_state.clients[backend]
@@ -74,8 +74,13 @@ end
 
 function update_global_state!(args...; kwargs...)
     update!(global_state, args...; kwargs...)
-    # We need to update the clients based on the new state
-    initialize_default_clients!(global_backend_state)
+    # We conditionally initialize for now, since a lot of options that are set are not
+    # necessarily supported by PJRT. This makes testing for IFRT quite hard.
+    # Once we move to IFRT completely, we can remove this.
+    if global_backend_state.initialized
+        # We need to update the clients based on the new state
+        initialize_default_pjrt_clients!(global_backend_state)
+    end
     return nothing
 end
 
@@ -112,16 +117,27 @@ function __init__()
     return nothing
 end
 
-function initialize_default_clients!(state::BackendState)
+function initialize_default_pjrt_clients!(state::PJRTBackendState)
     was_initialized = state.initialized
     state.initialized = true
+    distributed_runtime_client = if global_state.num_processes > 1
+        @assert global_state.client !== nothing
+        global_state.client
+    else
+        nothing
+    end
+    common_kwargs = (;
+        node_id=global_state.process_id,
+        num_nodes=global_state.num_processes,
+        distributed_runtime_client,
+    )
 
     # CPU
     if was_initialized && haskey(state.clients, "cpu")
         XLA.free_client(state.clients["cpu"])
         XLA.PJRT.cpu_client_count[] -= 1
     end
-    cpu = PJRT.CPUClient(global_state.process_id, global_state.num_processes)
+    cpu = PJRT.CPUClient(; common_kwargs..., asynchronous=true)
     state.clients["cpu"] = cpu
     state.default_client = cpu
 
@@ -144,8 +160,9 @@ function initialize_default_clients!(state::BackendState)
                     XLA.free_client(state.clients["tpu"])
                     XLA.PJRT.tpu_client_count[] -= 1
                 end
-                # XXX: process_id? num_processes?
-                tpu = PJRT.TPUClient(dataset_dir * "/libtpu.so")
+                tpu = PJRT.TPUClient(;
+                    tpu_path=dataset_dir * "/libtpu.so", common_kwargs...
+                )
                 state.clients["tpu"] = tpu
                 state.default_client = tpu
             catch e
@@ -154,22 +171,12 @@ function initialize_default_clients!(state::BackendState)
         else
             if !Reactant.precompiling()
                 try
-                    distributed_runtime_client = if global_state.num_processes > 1
-                        @assert global_state.client !== nothing
-                        global_state.client
-                    else
-                        nothing
-                    end
-
                     if was_initialized && haskey(state.clients, "gpu")
                         XLA.free_client(state.clients["gpu"])
                         XLA.PJRT.gpu_client_count[] -= 1
                     end
-                    gpu = PJRT.GPUClient(
-                        global_state.process_id,
-                        global_state.num_processes;
-                        allowed_devices=global_state.local_device_ids,
-                        distributed_runtime_client,
+                    gpu = PJRT.GPUClient(;
+                        common_kwargs..., allowed_devices=global_state.local_device_ids
                     )
                     state.clients["gpu"] = gpu
                     state.default_client = gpu