diff --git a/examples/mnist/README.md b/examples/mnist/README.md
index 9e0966f441..afe2a3ef7d 100644
--- a/examples/mnist/README.md
+++ b/examples/mnist/README.md
@@ -18,7 +18,7 @@
 $ python3 mnist-train-fc.py mnist-fc-f32.gguf
 
 ...
 
-Test loss: 0.066051+-0.011630, Test accuracy: 98.07+-0.14%
+Test loss: 0.066377+-0.010468, Test accuracy: 97.94+-0.14%
 Model tensors saved to mnist-fc-f32.gguf:
 fc1.weight (500, 784)
@@ -61,22 +61,21 @@ ________________________________________________________
 ________________________________________________________
 ________________________________________________________
 ________________________________________________________
-mnist_graph_eval: trying to load a ggml graph from mnist-fc-f32.gguf
-ggml_graph_import: invalid magic number, got 46554747
-mnist_graph_eval: could not load a ggml graph from mnist-fc-f32.gguf
 ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
 ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
 ggml_cuda_init: found 1 CUDA devices:
   Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
-mnist_model: using CPU backend
+mnist_model: using CUDA0 (NVIDIA GeForce RTX 3090) as primary backend
+mnist_model: unsupported operations will be executed on the following fallback backends (in order of priority):
+mnist_model:  - CPU (AMD Ryzen 9 5950X 16-Core Processor)
 mnist_model_init_from_file: loading model weights from 'mnist-fc-f32.gguf'
 mnist_model_init_from_file: model arch is mnist-fc
 mnist_model_init_from_file: successfully loaded weights from mnist-fc-f32.gguf
-main: loaded model in 13.03 ms
-mnist_model_eval: model evaluation on 10000 images took 95.02 ms, 9.50 us/image
+main: loaded model in 109.44 ms
+mnist_model_eval: model evaluation on 10000 images took 76.92 ms, 7.69 us/image
 main: predicted digit is 3
-main: test_loss=0.066051+-0.009343
-main: test_acc=98.07+-0.14%
+main: test_loss=0.066379+-0.009101
+main: test_acc=97.94+-0.14%
 ```
 
 In addition to the evaluation on the test set the GGML evaluation also prints a random image from the test set as well as the model prediction for said image.
@@ -87,10 +86,6 @@ $ ../../build/bin/mnist-train mnist-fc mnist-fc-f32.gguf data/MNIST/raw/train-im
 ```
 
 It can then be evaluated with the same binary as above.
-When training a model with GGML the computation graph for the forward pass is also exported to `mnist-fc-f32.ggml`.
-Compared to the GGUF (which only contains the weights) this file also contains the model architecture.
-As long as the input and output tensors are well-defined an exported GGML graph is fully agnostic w.r.t. the model architecture.
-It can be evaluated using the `mnist-eval` binary by substituting the argument for the GGUF file.
 
 ## Convolutional network
 
@@ -101,8 +96,8 @@
 $ python3 mnist-train-cnn.py mnist-cnn-f32.gguf
 
 ...
 
-Test loss: 0.045483
-Test accuracy: 98.56%
+Test loss: 0.047947
+Test accuracy: 98.46%
 GGUF model saved to 'mnist-cnn-f32.gguf'
 ```
@@ -139,22 +134,21 @@ ________________________________________________________
 ________________________________________________________
 ________________________________________________________
 ________________________________________________________
-mnist_graph_eval: trying to load a ggml graph from mnist-cnn-f32.gguf
-ggml_graph_import: invalid magic number, got 46554747
-mnist_graph_eval: could not load a ggml graph from mnist-cnn-f32.gguf
 ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
 ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
 ggml_cuda_init: found 1 CUDA devices:
   Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
-mnist_model: using CPU backend
+mnist_model: using CUDA0 (NVIDIA GeForce RTX 3090) as primary backend
+mnist_model: unsupported operations will be executed on the following fallback backends (in order of priority):
+mnist_model:  - CPU (AMD Ryzen 9 5950X 16-Core Processor)
 mnist_model_init_from_file: loading model weights from 'mnist-cnn-f32.gguf'
 mnist_model_init_from_file: model arch is mnist-cnn
 mnist_model_init_from_file: successfully loaded weights from mnist-cnn-f32.gguf
-main: loaded model in 11.88 ms
-mnist_model_eval: model evaluation on 10000 images took 1074.09 ms, 107.41 us/image
+main: loaded model in 91.99 ms
+mnist_model_eval: model evaluation on 10000 images took 267.61 ms, 26.76 us/image
 main: predicted digit is 1
-main: test_loss=0.045483+-0.006884
-main: test_acc=98.56+-0.12%
+main: test_loss=0.047955+-0.007029
+main: test_acc=98.46+-0.12%
 ```
 
 Like with the fully connected network the convolutional network can also be trained on the CPU using GGML:
@@ -165,11 +159,12 @@ $ ../../build/bin/mnist-train mnist-cnn mnist-cnn-f32.gguf data/MNIST/raw/train-
 ```
 
-As always, the evaluation is done using `mnist-eval` and like with the fully connected network the GGML graph is exported to `mnist-cnn-f32.ggml`.
+As always, the evaluation is done using `mnist-eval`.
 
-## CUDA
+## Hardware Acceleration
 
-The fully connected model can be trained and evaluated using CUDA.
-`mnist-train` and `mnist-eval` accept an additional, optional argument behind those listed so far to specify the backend.
-The default is `CPU`, by specifying `CUDA0` the first available CUDA device can be used instead (make sure to compile GGML with CUDA support).
+Both the training and the evaluation code are hardware-agnostic as long as the corresponding GGML backend implements the necessary operations.
+A specific backend can be selected by appending a backend name to the commands above.
+The compute graphs then schedule their operations to preferentially use the specified backend.
+Note that if a backend does not implement some of the necessary operations, a CPU fallback is used instead, which may result in poor performance.
 
 ## Web demo
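The new "Hardware Acceleration" section describes a primary backend with a CPU fallback, which is what produces the `mnist_model: using CUDA0 ... as primary backend` log lines above. As a rough sketch of what such a setup looks like on the C++ side: the helper name `make_sched` is hypothetical, GGML is assumed to be compiled with CUDA support, and the exact `ggml_backend_sched_new` signature and header locations may differ between ggml revisions.

```
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cuda.h"

// Hypothetical helper: CUDA0 as the primary backend, CPU as the fallback for
// any op the primary backend does not implement. Passing nullptr for the
// buffer types selects each backend's default buffer type.
static ggml_backend_sched_t make_sched(void) {
    ggml_backend_t backends[2];
    backends[0] = ggml_backend_cuda_init(0); // primary backend (CUDA0)
    backends[1] = ggml_backend_cpu_init();   // fallback backend (CPU)
    return ggml_backend_sched_new(backends, nullptr, 2, GGML_DEFAULT_GRAPH_SIZE, false);
}
```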
diff --git a/examples/mnist/mnist-common.h b/examples/mnist/mnist-common.h
index a871d030a5..090cf37159 100644
--- a/examples/mnist/mnist-common.h
+++ b/examples/mnist/mnist-common.h
@@ -33,64 +33,6 @@ static_assert(MNIST_NTEST % MNIST_NBATCH_LOGICAL == 0, "MNIST_NTRAIN % MNIST_NB
 // NCB = number of channels base
 #define MNIST_CNN_NCB 8
 
-struct mnist_dataset {
-    struct ggml_context * ctx;
-    struct ggml_tensor  * data;
-    struct ggml_tensor  * labels;
-
-    int64_t nex;
-    int64_t shard_size;
-    size_t  nbs_data;
-    size_t  nbs_labels;
-
-    std::vector<int64_t> permutation;
-    std::mt19937 rng;
-
-    mnist_dataset(const int64_t nex, const int64_t shard_size) : nex(nex), shard_size(shard_size) {
-        const size_t nbytes_images = nex*MNIST_NINPUT  *sizeof(float) + ggml_tensor_overhead();
-        const size_t nbytes_labels = nex*MNIST_NCLASSES*sizeof(float) + ggml_tensor_overhead();
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ nbytes_images + nbytes_labels,
-            /*.mem_buffer =*/ nullptr,
-            /*.no_alloc   =*/ false,
-        };
-        ctx = ggml_init(params);
-
-        data   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, MNIST_HW, MNIST_HW, nex);
-        labels = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, MNIST_NCLASSES, nex);
-
-        nbs_data   = ggml_nbytes(data)   * shard_size/nex;
-        nbs_labels = ggml_nbytes(labels) * shard_size/nex;
-
-        permutation.resize(nex/shard_size);
-        for (size_t i = 0; i < permutation.size(); ++i) {
-            permutation[i] = i;
-        }
-    }
-
-    ~mnist_dataset() {
-        ggml_free(ctx);
-    }
-
-    void shuffle(const size_t ishard_max) {
-        if (ishard_max < permutation.size()) {
-            std::shuffle(permutation.begin(), permutation.begin() + ishard_max, rng);
-            return;
-        }
-        std::shuffle(permutation.begin(), permutation.end(), rng);
-    }
-
-    void get_batch(struct ggml_tensor * data_batch, struct ggml_tensor * labels_batch, const int64_t ibatch) {
-        const int64_t shards_per_batch = ggml_nbytes(data_batch) / nbs_data;
-        for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) {
-            const int64_t ishard = permutation[ibatch*shards_per_batch + ishard_batch];
-
-            ggml_backend_tensor_set(data_batch,   (const char *) data->data   + ishard*nbs_data,   ishard_batch*nbs_data,   nbs_data);
-            ggml_backend_tensor_set(labels_batch, (const char *) labels->data + ishard*nbs_labels, ishard_batch*nbs_labels, nbs_labels);
-        }
-    }
-};
-
 struct mnist_model {
     std::string arch;
     ggml_backend_sched_t backend_sched;
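For reference, a minimal sketch of how the removed `mnist_dataset` was meant to be driven in a training loop: the dataset owns the full data in host memory as shards, shuffles a permutation of shard indices, and copies whole shards into pre-allocated batch tensors. `train_epoch` and its arguments are illustrative names, not code from this PR.

```
#include "mnist-common.h"

// Illustrative only: drives the removed mnist_dataset for one epoch.
// data_batch/labels_batch must be backend tensors whose sizes are whole
// multiples of nbs_data/nbs_labels respectively.
static void train_epoch(mnist_dataset & dataset,
                        struct ggml_tensor * data_batch,
                        struct ggml_tensor * labels_batch,
                        const int64_t nbatches) {
    dataset.shuffle(dataset.permutation.size()); // shuffle all shards
    for (int64_t ibatch = 0; ibatch < nbatches; ++ibatch) {
        // copies shard_size-sized slices of images/labels into the batch tensors
        dataset.get_batch(data_batch, labels_batch, ibatch);
        // ... forward/backward pass on the batch goes here ...
    }
}
```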
diff --git a/include/ggml-backend.h b/include/ggml-backend.h
index b02b43a2ba..82537819d1 100644
--- a/include/ggml-backend.h
+++ b/include/ggml-backend.h
@@ -77,11 +77,6 @@ extern "C" {
     GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
 
-    // by default each ggml_tensor is intended to be allocated exactly once
-    // if a tensor is allocated multiple times the pointers to its allocations need to be explicitly cleared with this function
-    // note that this does NOT free the memory for the actual allocations, that is done via e.g. ggml_backend_sched
-    GGML_API void ggml_backend_tensor_reset(struct ggml_tensor * tensor);
-
     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
 
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
@@ -288,7 +283,7 @@
 
     // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
     // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
-    // The correct way to use this API is to either discard all deallocated tensors or to reset them via ggml_backend_tensor_reset.
+    // The correct way to use this API is to discard the deallocated tensors and create new ones.
     GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
     // Set a callback to be called for each resulting node during graph compute
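A short sketch of the usage pattern the updated `ggml_backend_sched_reset` comment prescribes: after a reset, previously allocated tensors are dangling and must be discarded; the caller builds a fresh graph with new tensors and allocates it anew. `build_graph` and `recompute` are placeholder names for user code.

```
#include "ggml.h"
#include "ggml-backend.h"

struct ggml_cgraph * build_graph(struct ggml_context * ctx); // user-provided

static void recompute(ggml_backend_sched_t sched, struct ggml_context * ctx) {
    ggml_backend_sched_reset(sched);                // invalidates all previous allocations
    struct ggml_cgraph * graph = build_graph(ctx);  // new tensors, not the dangling ones
    ggml_backend_sched_alloc_graph(sched, graph);   // allocates the new tensors
    ggml_backend_sched_graph_compute(sched, graph);
}
```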
diff --git a/include/ggml-opt.h b/include/ggml-opt.h
index 43e7b76f56..c868f98dca 100644
--- a/include/ggml-opt.h
+++ b/include/ggml-opt.h
@@ -25,8 +25,8 @@ extern "C" {
 
     // ====== Loss ======
 
-    // built-in loss types the quantity minimized by the optimizer
-    // custom loss types can be defined via mean or sum which reduce the outputs for all datapoints to a single value
+    // built-in loss types, i.e. the built-in quantities minimized by the optimizer
+    // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
     enum ggml_opt_loss_type {
         GGML_OPT_LOSS_TYPE_MEAN,
         GGML_OPT_LOSS_TYPE_SUM,
@@ -71,16 +71,19 @@
         } adamw;
     };
 
-    // callback to calculate optimizer parameters with arbitrary data that can be set by the user
+    // callback to calculate optimizer parameters prior to a backward pass
+    // userdata can be used to pass arbitrary data
     typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
 
+    // returns the default optimizer params (constant)
+    // userdata is not used
     GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
 
     // parameters for initializing a new optimization context
     struct ggml_opt_params {
-        ggml_backend_sched_t backend_sched;
+        ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
 
-        struct ggml_context * ctx_compute;
+        struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
 
         // the forward graph is defined by inputs and outputs
         // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
@@ -126,10 +129,10 @@
     GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
 
     // get data from result, uncertainties are optional and can be ignored by passing NULL
-    GGML_API void ggml_opt_result_ndata(   ggml_opt_result_t result, int64_t * ndata);                 // write 1 value, number of datapoints
-    GGML_API void ggml_opt_result_loss(    ggml_opt_result_t result, double * loss,     double * unc); // write 1 value
-    GGML_API void ggml_opt_result_pred(    ggml_opt_result_t result, int32_t * pred);                  // write ndata values
-    GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // write 1 value
+    GGML_API void ggml_opt_result_ndata(   ggml_opt_result_t result, int64_t * ndata);                 // writes 1 value, number of datapoints
+    GGML_API void ggml_opt_result_loss(    ggml_opt_result_t result, double * loss,     double * unc); // writes 1 value
+    GGML_API void ggml_opt_result_pred(    ggml_opt_result_t result, int32_t * pred);                  // writes ndata values
+    GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // writes 1 value
 
     // ====== Computation ======
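A short sketch of the callback contract documented above: the callback runs prior to each backward pass and `userdata` is forwarded verbatim, so it can carry things like a learning-rate schedule. `lr_from_userdata` is a hypothetical example that treats `userdata` as a pointer to the desired learning rate.

```
#include "ggml-opt.h"

// Hypothetical ggml_opt_get_optimizer_params callback: start from the
// defaults and override only the AdamW learning rate (adamw.alpha).
static struct ggml_opt_optimizer_params lr_from_userdata(void * userdata) {
    struct ggml_opt_optimizer_params params = ggml_opt_get_default_optimizer_params(NULL);
    params.adamw.alpha = *((const float *) userdata); // override the default learning rate
    return params;
}
```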
diff --git a/src/ggml-backend.cpp b/src/ggml-backend.cpp
index b074b044ce..ec9d5f26c9 100644
--- a/src/ggml-backend.cpp
+++ b/src/ggml-backend.cpp
@@ -294,14 +294,6 @@ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     buf->iface.memset_tensor(buf, tensor, value, offset, size);
 }
 
-void ggml_backend_tensor_reset(struct ggml_tensor * tensor) {
-    // the allocation/deallocation of the memory that these pointers point to is not the responsibility of ggml_tensor
-    // they can therefore simply be cleared without creating a memory leak
-    tensor->data   = nullptr;
-    tensor->buffer = nullptr;
-    tensor->extra  = nullptr;
-}
-
 void ggml_backend_synchronize(ggml_backend_t backend) {
     if (backend->iface.synchronize == NULL) {
         return;
diff --git a/src/ggml-opt.cpp b/src/ggml-opt.cpp
index d05291c00f..0802c84360 100644
--- a/src/ggml-opt.cpp
+++ b/src/ggml-opt.cpp
@@ -369,6 +369,7 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
         case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
             result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
             ggml_set_input(result->labels);
+            ggml_set_name(result->labels, "labels");
             result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels);
             ggml_set_name(result->loss, "loss_cross_entropy");
             if (result->opt_period > 1) {
@@ -381,6 +382,7 @@
         case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
             result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
             ggml_set_input(result->labels);
+            ggml_set_name(result->labels, "labels");
             result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels);
             ggml_set_name(result->loss, "loss_error");
             result->loss = ggml_sqr(result->ctx_static, result->loss);
@@ -417,6 +419,7 @@
     result->gb_opt = nullptr;
 
     result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
+    result->buf_static_cpu = nullptr;
 
     ggml_opt_alloc_graph(result, result->gf);
 
@@ -576,7 +579,7 @@ void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc) {
 
 // ====== Computation ======
 
 static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) {
-    {
+    if (graph != opt_ctx->gf) {
         struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
 
         GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
diff --git a/src/ggml.c b/src/ggml.c
index 8d94656d63..70c74014c3 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -6456,7 +6456,6 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
         // initial gradients of loss should be 1, 0 otherwise
         struct ggml_tensor * grad = node->grad;
         while (grad && !grad->data && grad->view_src) {
-            GGML_ASSERT(grad->view_offs == 0); // FIXME
             grad = grad->view_src;
         }
         if (grad && grad->data) {
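A short sketch of what the added `ggml_set_name(result->labels, "labels")` calls enable: the labels tensor can now be looked up by name in a compute graph, e.g. to upload label data from user code. `set_labels` and its arguments are illustrative user-side code, not part of this PR.

```
#include "ggml.h"
#include "ggml-backend.h"

// Illustrative only: finds the named labels tensor in a graph and copies
// label data into it via the backend interface.
static void set_labels(struct ggml_cgraph * gf, const float * labels_data, const size_t nbytes) {
    struct ggml_tensor * labels = ggml_graph_get_tensor(gf, "labels");
    GGML_ASSERT(labels != NULL);
    GGML_ASSERT(ggml_nbytes(labels) == nbytes);
    ggml_backend_tensor_set(labels, labels_data, 0, nbytes);
}
```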