Distributed inference via MPI #2099

Merged: 29 commits, Jul 10, 2023
Changes from 4 commits

Commits (29)
f85785f
MPI support, first cut
evanmiller Jul 4, 2023
d05ca74
fix warnings, update README
evanmiller Jul 4, 2023
668ba5f
fixes
evanmiller Jul 4, 2023
042c5b2
wrap includes
evanmiller Jul 4, 2023
32deabf
Merge branch 'master' into mpi
evanmiller Jul 6, 2023
06a2393
PR comments
evanmiller Jul 7, 2023
1f0a2cf
Update CMakeLists.txt
evanmiller Jul 7, 2023
55207ba
Add GH workflow, fix test
evanmiller Jul 7, 2023
ef61acf
Add info to README
evanmiller Jul 7, 2023
3232db6
mpi : trying to move more MPI stuff into ggml-mpi (WIP) (#2099)
ggerganov Jul 9, 2023
e339d35
mpi : add names for layer inputs + prep ggml_mpi_graph_compute()
ggerganov Jul 9, 2023
01abb3b
mpi : move all MPI logic into ggml-mpi
ggerganov Jul 9, 2023
c717c51
mpi : various fixes - communication now works but results are wrong
ggerganov Jul 9, 2023
ef37dd1
mpi : fix output tensor after MPI compute (still not working)
ggerganov Jul 9, 2023
beadbf3
mpi : fix inference
ggerganov Jul 9, 2023
9da9d26
mpi : minor
ggerganov Jul 9, 2023
0f557c2
Merge branch 'master' into mpi
evanmiller Jul 9, 2023
4a9a474
Add OpenMPI to GH action
evanmiller Jul 9, 2023
03cc12b
[mpi] continue-on-error: true
evanmiller Jul 9, 2023
81c5ddd
Merge branch 'mpi' into refactor-mpi
ggerganov Jul 9, 2023
1c3a15c
Merge pull request #1 from ggerganov/refactor-mpi
evanmiller Jul 9, 2023
166db36
mpi : fix after master merge
ggerganov Jul 9, 2023
f085a57
[mpi] Link MPI C++ libraries to fix OpenMPI
evanmiller Jul 9, 2023
00b8aa1
tests : fix new llama_backend API
ggerganov Jul 9, 2023
666a15a
Merge remote-tracking branch 'refs/remotes/origin/mpi' into mpi
evanmiller Jul 9, 2023
b18e4ad
Merge branch 'mpi' of github.com:evanmiller/llama.cpp into mpi
evanmiller Jul 9, 2023
ada1a2a
[mpi] use MPI_INT32_T
evanmiller Jul 9, 2023
c3c3ef1
mpi : factor out recv / send in functions and reuse
ggerganov Jul 10, 2023
eaef2d0
mpi : extend API to allow usage with outer backends (e.g. Metal)
ggerganov Jul 10, 2023
5 changes: 5 additions & 0 deletions Makefile
@@ -149,6 +149,11 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_MPI
CFLAGS += -DGGML_USE_MPI -Wno-cast-qual -Wno-int-to-void-pointer-cast -Wno-void-pointer-to-int-cast
CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
endif # LLAMA_MPI

ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
LDFLAGS += -lopenblas
29 changes: 29 additions & 0 deletions README.md
@@ -267,6 +267,35 @@ Any value larger than 0 will offload the computation to the GPU. For example:
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
```

### MPI Build

MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.

First, build llama.cpp and download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines. You will need to build llama.cpp with an MPI-capable compiler, for example,

```bash
make CC=mpicc CXX=mpicxx LLAMA_MPI=1
```

Once the programs are built and the weights are downloaded on all machines, ensure password-less SSH access to each machine from the primary host.

Next, create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".

Here is an example hostfile:

```
192.168.0.1:2
malvolio.local:1
```

The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.

Finally, you're ready to run a computation using `mpirun`:

```bash
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```

### BLAS Build

Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
2 changes: 2 additions & 0 deletions examples/main/main.cpp
@@ -671,5 +671,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);

llama_finalize_backend();

return 0;
}
2 changes: 2 additions & 0 deletions examples/perplexity/perplexity.cpp
@@ -172,5 +172,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);

llama_finalize_backend();

return 0;
}
2 changes: 2 additions & 0 deletions examples/simple/simple.cpp
@@ -173,6 +173,8 @@ int main(int argc, char ** argv)
llama_free( ctx );
llama_free_model( model );

llama_finalize_backend();

return 0;
}

98 changes: 98 additions & 0 deletions ggml.c
@@ -26,6 +26,10 @@
#include <limits.h>
#include <stdarg.h>

#ifdef GGML_USE_MPI
#include <mpi.h>
#endif

#ifdef GGML_USE_METAL
#include <unistd.h>
#endif
@@ -4648,6 +4652,36 @@ struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggm
return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL);
}

struct ggml_tensor * ggml_send_tensor(
struct ggml_context * ctx,
struct ggml_tensor *src,
int dst_rank) {

struct ggml_tensor * result = ggml_new_i32(ctx, 0);

result->op = GGML_OP_SEND;
result->src0 = src;
result->extra = (void *)dst_rank;

return result;
}

struct ggml_tensor * ggml_recv_tensor(
struct ggml_context * ctx,
struct ggml_tensor *parent,
struct ggml_tensor *dst,
int src_rank) {
UNUSED(ctx);

struct ggml_tensor * result = dst;

result->op = GGML_OP_RECV;
result->src0 = parent; // just used for graph computation
result->extra = (void *)src_rank;

return result;
}

struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
memset(tensor->data, 0, ggml_nbytes(tensor));
return tensor;
@@ -8191,6 +8225,52 @@ static void ggml_compute_forward_dup(
}
}

// ggml_compute_forward_recv

static void ggml_compute_forward_recv(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
GGML_ASSERT(dst->type == GGML_TYPE_F32);
#ifdef GGML_USE_MPI
MPI_Status status;
int my_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
// fprintf(stderr, "(%d) Receiving from (%d)\n", my_rank, (int)dst->extra);
int retval = MPI_Recv(dst->data, dst->ne[0] * dst->ne[1], MPI_FLOAT, (int)dst->extra, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
// fprintf(stderr, "(%d) Received from (%d)\n", my_rank, (int)dst->extra);
GGML_ASSERT(retval == MPI_SUCCESS);
#else
GGML_ASSERT(false);
#endif
}

// ggml_compute_forward_send

static void ggml_compute_forward_send(
const struct ggml_compute_params * params,
struct ggml_tensor * src,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_I32);
#ifdef GGML_USE_MPI
int my_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
// fprintf(stderr, "(%d) Sending to (%d)\n", my_rank, (int)dst->extra);
int retval = MPI_Send(src->data, src->ne[0] * src->ne[1], MPI_FLOAT, (int)dst->extra, 0, MPI_COMM_WORLD);
// fprintf(stderr, "(%d) Sent to (%d)\n", my_rank, (int)dst->extra);
ggml_set_i32(dst, retval);
GGML_ASSERT(retval == MPI_SUCCESS);
#else
GGML_ASSERT(false);
#endif
}

// ggml_compute_forward_add

static void ggml_compute_forward_add_f32(
@@ -15420,6 +15500,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_dup(params, tensor->src0, tensor);
} break;
case GGML_OP_SEND:
{
ggml_compute_forward_send(params, tensor->src0, tensor);
} break;
case GGML_OP_RECV:
{
ggml_compute_forward_recv(params, tensor);
} break;
case GGML_OP_ADD:
{
ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor);
@@ -15710,6 +15798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
}
} break;
case GGML_OP_SEND:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_RECV:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ADD:
{
if (src0->grad) {
@@ -17058,6 +17154,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
{
node->n_tasks = 1;
} break;
case GGML_OP_SEND:
case GGML_OP_RECV:
case GGML_OP_SET:
case GGML_OP_CONT:
case GGML_OP_RESHAPE:
13 changes: 13 additions & 0 deletions ggml.h
@@ -353,6 +353,9 @@ extern "C" {
GGML_OP_CROSS_ENTROPY_LOSS_BACK,

GGML_OP_COUNT,

GGML_OP_SEND,
GGML_OP_RECV,
};


@@ -556,6 +559,16 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);

GGML_API struct ggml_tensor * ggml_send_tensor(
struct ggml_context * ctx,
struct ggml_tensor *src,
int dst_rank);
GGML_API struct ggml_tensor * ggml_recv_tensor(
struct ggml_context * ctx,
struct ggml_tensor *parent,
struct ggml_tensor *dst,
int src_rank);

GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
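
For orientation, here is a minimal sketch of how the new `ggml_send_tensor` / `ggml_recv_tensor` ops can be wired into a compute graph so that one rank's activation becomes another rank's input. This is an editorial aside, not code from the PR: the file name and constants are made up, and it assumes ggml is built with `GGML_USE_MPI` (e.g. `make CC=mpicc CXX=mpicxx LLAMA_MPI=1`), that the graph API still takes a `ggml_context` as it did at the time of this PR, and that exactly two ranks are launched.

```c
// Hypothetical usage sketch (not part of this PR).
// Build against ggml with -DGGML_USE_MPI and run with: mpirun -n 2 ./send_recv_demo
#include <stdio.h>
#include <mpi.h>
#include "ggml.h"

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // a small f32 activation; on rank 1 this is only a placeholder that the
    // receive will overwrite with rank 0's data
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    ggml_set_f32(x, 1.0f);

    struct ggml_tensor * out;
    if (rank == 0) {
        // rank 0: compute the first "stage", then hand the result to rank 1
        struct ggml_tensor * h = ggml_scale(ctx, x, ggml_new_f32(ctx, 2.0f));
        out = ggml_send_tensor(ctx, h, /*dst_rank =*/ 1);
    } else {
        // rank 1: receive rank 0's activation into x, then keep computing;
        // there is no upstream node on this rank, so parent is NULL
        struct ggml_tensor * h = ggml_recv_tensor(ctx, /*parent =*/ NULL, x, /*src_rank =*/ 0);
        out = ggml_scale(ctx, h, ggml_new_f32(ctx, 0.5f));
    }

    struct ggml_cgraph gf = ggml_build_forward(out);
    ggml_graph_compute(ctx, &gf);   // GGML_OP_SEND / GGML_OP_RECV execute inside the graph

    if (rank == 1) {
        // rank 0 scaled by 2, rank 1 by 0.5, so this should print 1.0
        printf("rank 1 result: %f\n", ggml_get_f32_1d(out, 0));
    }

    ggml_free(ctx);
    MPI_Finalize();
    return 0;
}
```

The same send/receive mechanism is what lets a model's layers be split across hosts: each rank computes its slice of the graph and forwards the hidden state to the next rank as a graph operation rather than with explicit MPI calls in the application code.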