diff --git a/examples/mnist/README.md b/examples/mnist/README.md
index 9e0966f441..afe2a3ef7d 100644
--- a/examples/mnist/README.md
+++ b/examples/mnist/README.md
@@ -18,7 +18,7 @@
 $ python3 mnist-train-fc.py mnist-fc-f32.gguf
 
 ...
 
-Test loss: 0.066051+-0.011630, Test accuracy: 98.07+-0.14%
+Test loss: 0.066377+-0.010468, Test accuracy: 97.94+-0.14%
 Model tensors saved to mnist-fc-f32.gguf:
 fc1.weight (500, 784)
@@ -61,22 +61,21 @@ ________________________________________________________
 ________________________________________________________
 ________________________________________________________
 ________________________________________________________
-mnist_graph_eval: trying to load a ggml graph from mnist-fc-f32.gguf
-ggml_graph_import: invalid magic number, got 46554747
-mnist_graph_eval: could not load a ggml graph from mnist-fc-f32.gguf
 ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
 ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
 ggml_cuda_init: found 1 CUDA devices:
   Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
-mnist_model: using CPU backend
+mnist_model: using CUDA0 (NVIDIA GeForce RTX 3090) as primary backend
+mnist_model: unsupported operations will be executed on the following fallback backends (in order of priority):
+mnist_model:  - CPU (AMD Ryzen 9 5950X 16-Core Processor)
 mnist_model_init_from_file: loading model weights from 'mnist-fc-f32.gguf'
 mnist_model_init_from_file: model arch is mnist-fc
 mnist_model_init_from_file: successfully loaded weights from mnist-fc-f32.gguf
-main: loaded model in 13.03 ms
-mnist_model_eval: model evaluation on 10000 images took 95.02 ms, 9.50 us/image
+main: loaded model in 109.44 ms
+mnist_model_eval: model evaluation on 10000 images took 76.92 ms, 7.69 us/image
 main: predicted digit is 3
-main: test_loss=0.066051+-0.009343
-main: test_acc=98.07+-0.14%
+main: test_loss=0.066379+-0.009101
+main: test_acc=97.94+-0.14%
 ```
 
 In addition to the evaluation on the test set the GGML evaluation also prints a random image from the test set as well as the model prediction for said image.
@@ -87,10 +86,6 @@ $ ../../build/bin/mnist-train mnist-fc mnist-fc-f32.gguf data/MNIST/raw/train-im
 ```
 
 It can then be evaluated with the same binary as above.
-When training a model with GGML the computation graph for the forward pass is also exported to `mnist-fc-f32.ggml`.
-Compared to the GGUF (which only contains the weights) this file also contains the model architecture.
-As long as the input and output tensors are well-defined an exported GGML graph is fully agnostic w.r.t. the model architecture.
-It can be evaluated using the `mnist-eval` binary by substituting the argument for the GGUF file.
 
 ## Convolutional network
 
@@ -101,8 +96,8 @@
 $ python3 mnist-train-cnn.py mnist-cnn-f32.gguf
 
 ...
 
-Test loss: 0.045483
-Test accuracy: 98.56%
+Test loss: 0.047947
+Test accuracy: 98.46%
 GGUF model saved to 'mnist-cnn-f32.gguf'
 ```
@@ -139,22 +134,21 @@ ________________________________________________________
 ________________________________________________________
 ________________________________________________________
 ________________________________________________________
-mnist_graph_eval: trying to load a ggml graph from mnist-cnn-f32.gguf
-ggml_graph_import: invalid magic number, got 46554747
-mnist_graph_eval: could not load a ggml graph from mnist-cnn-f32.gguf
 ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
 ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
 ggml_cuda_init: found 1 CUDA devices:
   Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
-mnist_model: using CPU backend
+mnist_model: using CUDA0 (NVIDIA GeForce RTX 3090) as primary backend
+mnist_model: unsupported operations will be executed on the following fallback backends (in order of priority):
+mnist_model:  - CPU (AMD Ryzen 9 5950X 16-Core Processor)
 mnist_model_init_from_file: loading model weights from 'mnist-cnn-f32.gguf'
 mnist_model_init_from_file: model arch is mnist-cnn
 mnist_model_init_from_file: successfully loaded weights from mnist-cnn-f32.gguf
-main: loaded model in 11.88 ms
-mnist_model_eval: model evaluation on 10000 images took 1074.09 ms, 107.41 us/image
+main: loaded model in 91.99 ms
+mnist_model_eval: model evaluation on 10000 images took 267.61 ms, 26.76 us/image
 main: predicted digit is 1
-main: test_loss=0.045483+-0.006884
-main: test_acc=98.56+-0.12%
+main: test_loss=0.047955+-0.007029
+main: test_acc=98.46+-0.12%
 ```
 
 Like with the fully connected network the convolutional network can also be trained on the CPU using GGML:
@@ -165,11 +159,12 @@ $ ../../build/bin/mnist-train mnist-cnn mnist-cnn-f32.gguf data/MNIST/raw/train-
 ```
 
-As always, the evaluation is done using `mnist-eval` and like with the fully connected network the GGML graph is exported to `mnist-cnn-f32.ggml`.
+As always, the evaluation is done using `mnist-eval`.
 
-## CUDA
+## Hardware Acceleration
 
-The fully connected model can be trained and evaluated using CUDA.
-`mnist-train` and `mnist-eval` accept an additional, optional argument behind those listed so far to specify the backend.
-The default is `CPU`, by specifying `CUDA0` the first available CUDA device can be used instead (make sure to compile GGML with CUDA support).
+Both the training and the evaluation code are hardware-agnostic as long as the corresponding GGML backend implements the necessary operations.
+A specific backend can be selected by appending a backend name to the commands above.
+The compute graphs then schedule their operations to preferentially use the specified backend.
+Note that if a backend does not implement some of the necessary operations, a CPU fallback is used instead, which may result in poor performance.
 
 ## Web demo
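The new "Hardware Acceleration" section describes a primary backend with a CPU fallback, which is what produces the `mnist_model: using CUDA0 ... as primary backend` log lines above. As a rough sketch of what such a setup looks like on the C++ side: the helper name `make_sched` is hypothetical, GGML is assumed to be compiled with CUDA support, and the exact `ggml_backend_sched_new` signature and header locations may differ between ggml revisions.

```
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cuda.h"

// Hypothetical helper: CUDA0 as the primary backend, CPU as the fallback for
// any op the primary backend does not implement. Passing nullptr for the
// buffer types selects each backend's default buffer type.
static ggml_backend_sched_t make_sched(void) {
    ggml_backend_t backends[2];
    backends[0] = ggml_backend_cuda_init(0); // primary backend (CUDA0)
    backends[1] = ggml_backend_cpu_init();   // fallback backend (CPU)
    return ggml_backend_sched_new(backends, nullptr, 2, GGML_DEFAULT_GRAPH_SIZE, false);
}
```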
diff --git a/examples/mnist/mnist-common.h b/examples/mnist/mnist-common.h
index a871d030a5..090cf37159 100644
--- a/examples/mnist/mnist-common.h
+++ b/examples/mnist/mnist-common.h
@@ -33,64 +33,6 @@ static_assert(MNIST_NTEST % MNIST_NBATCH_LOGICAL == 0, "MNIST_NTRAIN % MNIST_NB
 // NCB = number of channels base
 #define MNIST_CNN_NCB 8
 
-struct mnist_dataset {
-    struct ggml_context * ctx;
-    struct ggml_tensor  * data;
-    struct ggml_tensor  * labels;
-
-    int64_t nex;
-    int64_t shard_size;
-    size_t  nbs_data;
-    size_t  nbs_labels;
-
-    std::vector<int64_t> permutation;
-    std::mt19937 rng;
-
-    mnist_dataset(const int64_t nex, const int64_t shard_size) : nex(nex), shard_size(shard_size) {
-        const size_t nbytes_images = nex*MNIST_NINPUT  *sizeof(float) + ggml_tensor_overhead();
-        const size_t nbytes_labels = nex*MNIST_NCLASSES*sizeof(float) + ggml_tensor_overhead();
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ nbytes_images + nbytes_labels,
-            /*.mem_buffer =*/ nullptr,
-            /*.no_alloc   =*/ false,
-        };
-        ctx = ggml_init(params);
-
-        data   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, MNIST_HW, MNIST_HW, nex);
-        labels = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, MNIST_NCLASSES, nex);
-
-        nbs_data   = ggml_nbytes(data)   * shard_size/nex;
-        nbs_labels = ggml_nbytes(labels) * shard_size/nex;
-
-        permutation.resize(nex/shard_size);
-        for (size_t i = 0; i < permutation.size(); ++i) {
-            permutation[i] = i;
-        }
-    }
-
-    ~mnist_dataset() {
-        ggml_free(ctx);
-    }
-
-    void shuffle(const size_t ishard_max) {
-        if (ishard_max < permutation.size()) {
-            std::shuffle(permutation.begin(), permutation.begin() + ishard_max, rng);
-            return;
-        }
-        std::shuffle(permutation.begin(), permutation.end(), rng);
-    }
-
-    void get_batch(struct ggml_tensor * data_batch, struct ggml_tensor * labels_batch, const int64_t ibatch) {
-        const int64_t shards_per_batch = ggml_nbytes(data_batch) / nbs_data;
-        for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) {
-            const int64_t ishard = permutation[ibatch*shards_per_batch + ishard_batch];
-
-            ggml_backend_tensor_set(data_batch,   (const char *) data->data   + ishard*nbs_data,   ishard_batch*nbs_data,   nbs_data);
-            ggml_backend_tensor_set(labels_batch, (const char *) labels->data + ishard*nbs_labels, ishard_batch*nbs_labels, nbs_labels);
-        }
-    }
-};
-
 struct mnist_model {
     std::string arch;
     ggml_backend_sched_t backend_sched;
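For reference, a minimal sketch of how the removed `mnist_dataset` was meant to be driven in a training loop: the dataset owns the full data in host memory as shards, shuffles a permutation of shard indices, and copies whole shards into pre-allocated batch tensors. `train_epoch` and its arguments are illustrative names, not code from this PR.

```
#include "mnist-common.h"

// Illustrative only: drives the removed mnist_dataset for one epoch.
// data_batch/labels_batch must be backend tensors whose sizes are whole
// multiples of nbs_data/nbs_labels respectively.
static void train_epoch(mnist_dataset & dataset,
                        struct ggml_tensor * data_batch,
                        struct ggml_tensor * labels_batch,
                        const int64_t nbatches) {
    dataset.shuffle(dataset.permutation.size()); // shuffle all shards
    for (int64_t ibatch = 0; ibatch < nbatches; ++ibatch) {
        // copies shard_size-sized slices of images/labels into the batch tensors
        dataset.get_batch(data_batch, labels_batch, ibatch);
        // ... forward/backward pass on the batch goes here ...
    }
}
```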
diff --git a/include/ggml-backend.h b/include/ggml-backend.h
index b02b43a2ba..82537819d1 100644
--- a/include/ggml-backend.h
+++ b/include/ggml-backend.h
@@ -77,11 +77,6 @@ extern "C" {
     GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
 
-    // by default each ggml_tensor is intended to be allocated exactly once
-    // if a tensor is allocated multiple times the pointers to its allocations need to be explicitly cleared with this function
-    // note that this does NOT free the memory for the actual allocations, that is done via e.g. ggml_backend_sched
-    GGML_API void ggml_backend_tensor_reset(struct ggml_tensor * tensor);
-
     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
 
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
@@ -288,7 +283,7 @@
 
     // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
     // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
-    // The correct way to use this API is to either discard all deallocated tensors or to reset them via ggml_backend_tensor_reset.
+    // The correct way to use this API is to discard the deallocated tensors and create new ones.
     GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
     // Set a callback to be called for each resulting node during graph compute
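A short sketch of the usage pattern the updated `ggml_backend_sched_reset` comment prescribes: after a reset, previously allocated tensors are dangling and must be discarded; the caller builds a fresh graph with new tensors and allocates it anew. `build_graph` and `recompute` are placeholder names for user code.

```
#include "ggml.h"
#include "ggml-backend.h"

struct ggml_cgraph * build_graph(struct ggml_context * ctx); // user-provided

static void recompute(ggml_backend_sched_t sched, struct ggml_context * ctx) {
    ggml_backend_sched_reset(sched);                // invalidates all previous allocations
    struct ggml_cgraph * graph = build_graph(ctx);  // new tensors, not the dangling ones
    ggml_backend_sched_alloc_graph(sched, graph);   // allocates the new tensors
    ggml_backend_sched_graph_compute(sched, graph);
}
```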
diff --git a/include/ggml-opt.h b/include/ggml-opt.h
index 43e7b76f56..c868f98dca 100644
--- a/include/ggml-opt.h
+++ b/include/ggml-opt.h
@@ -25,8 +25,8 @@ extern "C" {
 
     // ====== Loss ======
 
-    // built-in loss types the quantity minimized by the optimizer
-    // custom loss types can be defined via mean or sum which reduce the outputs for all datapoints to a single value
+    // built-in loss types, i.e. the built-in quantities minimized by the optimizer
+    // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
     enum ggml_opt_loss_type {
         GGML_OPT_LOSS_TYPE_MEAN,
         GGML_OPT_LOSS_TYPE_SUM,
@@ -71,16 +71,19 @@
         } adamw;
     };
 
-    // callback to calculate optimizer parameters with arbitrary data that can be set by the user
+    // callback to calculate optimizer parameters prior to a backward pass
+    // userdata can be used to pass arbitrary data
     typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
 
+    // returns the default optimizer params (constant)
+    // userdata is not used
     GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
 
     // parameters for initializing a new optimization context
     struct ggml_opt_params {
-        ggml_backend_sched_t backend_sched;
+        ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
 
-        struct ggml_context * ctx_compute;
+        struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
 
         // the forward graph is defined by inputs and outputs
         // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
@@ -126,10 +129,10 @@
     GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
 
     // get data from result, uncertainties are optional and can be ignored by passing NULL
-    GGML_API void ggml_opt_result_ndata(   ggml_opt_result_t result, int64_t * ndata);                 // write 1 value, number of datapoints
-    GGML_API void ggml_opt_result_loss(    ggml_opt_result_t result, double * loss,     double * unc); // write 1 value
-    GGML_API void ggml_opt_result_pred(    ggml_opt_result_t result, int32_t * pred);                  // write ndata values
-    GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // write 1 value
+    GGML_API void ggml_opt_result_ndata(   ggml_opt_result_t result, int64_t * ndata);                 // writes 1 value, number of datapoints
+    GGML_API void ggml_opt_result_loss(    ggml_opt_result_t result, double * loss,     double * unc); // writes 1 value
+    GGML_API void ggml_opt_result_pred(    ggml_opt_result_t result, int32_t * pred);                  // writes ndata values
+    GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // writes 1 value
 
     // ====== Computation ======
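A short sketch of the callback contract documented above: the callback runs prior to each backward pass and `userdata` is forwarded verbatim, so it can carry things like a learning-rate schedule. `lr_from_userdata` is a hypothetical example that treats `userdata` as a pointer to the desired learning rate.

```
#include "ggml-opt.h"

// Hypothetical ggml_opt_get_optimizer_params callback: start from the
// defaults and override only the AdamW learning rate (adamw.alpha).
static struct ggml_opt_optimizer_params lr_from_userdata(void * userdata) {
    struct ggml_opt_optimizer_params params = ggml_opt_get_default_optimizer_params(NULL);
    params.adamw.alpha = *((const float *) userdata); // override the default learning rate
    return params;
}
```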
diff --git a/src/ggml-backend.cpp b/src/ggml-backend.cpp
index b074b044ce..ec9d5f26c9 100644
--- a/src/ggml-backend.cpp
+++ b/src/ggml-backend.cpp
@@ -294,14 +294,6 @@ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     buf->iface.memset_tensor(buf, tensor, value, offset, size);
 }
 
-void ggml_backend_tensor_reset(struct ggml_tensor * tensor) {
-    // the allocation/deallocation of the memory that these pointers point to is not the responsibility of ggml_tensor
-    // they can therefore simply be cleared without creating a memory leak
-    tensor->data   = nullptr;
-    tensor->buffer = nullptr;
-    tensor->extra  = nullptr;
-}
-
 void ggml_backend_synchronize(ggml_backend_t backend) {
     if (backend->iface.synchronize == NULL) {
         return;
diff --git a/src/ggml-opt.cpp b/src/ggml-opt.cpp
index d05291c00f..0802c84360 100644
--- a/src/ggml-opt.cpp
+++ b/src/ggml-opt.cpp
@@ -369,6 +369,7 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
         case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
             result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
             ggml_set_input(result->labels);
+            ggml_set_name(result->labels, "labels");
             result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels);
             ggml_set_name(result->loss, "loss_cross_entropy");
             if (result->opt_period > 1) {
@@ -381,6 +382,7 @@
         case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
             result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
             ggml_set_input(result->labels);
+            ggml_set_name(result->labels, "labels");
             result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels);
             ggml_set_name(result->loss, "loss_error");
             result->loss = ggml_sqr(result->ctx_static, result->loss);
@@ -417,6 +419,7 @@
     result->gb_opt = nullptr;
 
     result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
+    result->buf_static_cpu = nullptr;
 
     ggml_opt_alloc_graph(result, result->gf);
 
@@ -576,7 +579,7 @@ void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc) {
 
 // ====== Computation ======
 
 static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) {
-    {
+    if (graph != opt_ctx->gf) {
         struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
 
         GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
diff --git a/src/ggml.c b/src/ggml.c
index 8d94656d63..70c74014c3 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -6456,7 +6456,6 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
         // initial gradients of loss should be 1, 0 otherwise
         struct ggml_tensor * grad = node->grad;
         while (grad && !grad->data && grad->view_src) {
-            GGML_ASSERT(grad->view_offs == 0); // FIXME
             grad = grad->view_src;
         }
         if (grad && grad->data) {
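A short sketch of what the added `ggml_set_name(result->labels, "labels")` calls enable: the labels tensor can now be looked up by name in a compute graph, e.g. to upload label data from user code. `set_labels` and its arguments are illustrative user-side code, not part of this PR.

```
#include "ggml.h"
#include "ggml-backend.h"

// Illustrative only: finds the named labels tensor in a graph and copies
// label data into it via the backend interface.
static void set_labels(struct ggml_cgraph * gf, const float * labels_data, const size_t nbytes) {
    struct ggml_tensor * labels = ggml_graph_get_tensor(gf, "labels");
    GGML_ASSERT(labels != NULL);
    GGML_ASSERT(ggml_nbytes(labels) == nbytes);
    ggml_backend_tensor_set(labels, labels_data, 0, nbytes);
}
```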