From 9c678e5239659b85ee05378fe5b9ead77f7f36ff Mon Sep 17 00:00:00 2001
From: Aron
Date: Tue, 26 Mar 2024 15:09:46 +0100
Subject: [PATCH 1/3] Add precommit hooks for formatting and notebook creation, and CI for formatting
---
 .github/workflows/Format.yaml | 21 +++++++++++++++++++++
 .gitignore                    |  2 --
 .pre-commit-config.yaml       | 14 ++++++++++++++
 CONTRIBUTING.md               | 15 +++++++++++++++
 build_notebooks.sh            | 13 +++++++++++++
 format_julia_files.jl         |  6 ++++++
 make_examples.jl              | 13 +++++++++++--
 7 files changed, 80 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/Format.yaml
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 CONTRIBUTING.md
 create mode 100755 build_notebooks.sh
 create mode 100644 format_julia_files.jl

diff --git a/.github/workflows/Format.yaml b/.github/workflows/Format.yaml
new file mode 100644
index 0000000..8f0d929
--- /dev/null
+++ b/.github/workflows/Format.yaml
@@ -0,0 +1,21 @@
+name: Format suggestions
+on:
+  push:
+  pull_request:
+jobs:
+  code-style:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/cache@v1
+      - uses: julia-actions/setup-julia@v1
+      - name: format code
+        run: |
+          julia -e '
+          import Pkg;
+          Pkg.add("JuliaFormatter");
+          using JuliaFormatter;
+          JuliaFormatter.format(".")
+          '
+      - name: Check for uncommitted changes
+        run: git diff --exit-code

diff --git a/.gitignore b/.gitignore
index 281e67a..cc5f4d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,3 @@
-examples/*.ipynb
-examples/*.md
 examples/test*

 .vscode

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..c9ee5b9
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,14 @@
+repos:
+- repo: "local"
+  hooks:
+    - id: "format-julia"
+      name: "Format with JuliaFormatter"
+      entry: "julia format_julia_files.jl"
+      language: "system"
+
+    - id: "build-notebooks"
+      name: "Build Notebooks"
+      language: "script"
+      entry: "./build_notebooks.sh"
+      files: 'examples/.*\.jl$'
+

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..c077873
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,15 @@
+## Pre-commit
+We use pre-commit to run some checks before you commit your changes.
+
+To install it, follow the instructions [here](https://pre-commit.com/) and then run:
+
+```bash
+pre-commit install
+```
+
+in the root of the repository.
+
+Now every time you commit your changes, pre-commit will run a formatting check and create any notebooks whose source files have changed.
+If it makes any changes, you will have to stage them manually and commit again.
+
+If for some reason you want to skip the pre-commit checks, you can use the `--no-verify` flag when committing (perhaps to correct formatting in a separate commit for clarity).

diff --git a/build_notebooks.sh b/build_notebooks.sh
new file mode 100755
index 0000000..f9c25f8
--- /dev/null
+++ b/build_notebooks.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+echo "Building notebooks..."
+
+# pre-commit passes the staged files as arguments; list them one per line so the $-anchored grep matches each file
+FILENAMES=$(printf '%s\n' "$@" | grep 'examples/.*\.jl$')
+echo "Files changed: $FILENAMES"
+
+if [ -n "$FILENAMES" ]; then
+    julia make_examples.jl $FILENAMES
+else
+    echo "No Julia notebook source files changed."
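+    # an exit status of 0 here keeps the pre-commit hook passing when no notebook sources were staged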
+fi diff --git a/format_julia_files.jl b/format_julia_files.jl new file mode 100644 index 0000000..daf211b --- /dev/null +++ b/format_julia_files.jl @@ -0,0 +1,6 @@ +using JuliaFormatter + +for file in ARGS + format(file) + println("Formatted $file") +end diff --git a/make_examples.jl b/make_examples.jl index 76e1370..00631b0 100644 --- a/make_examples.jl +++ b/make_examples.jl @@ -3,8 +3,17 @@ using Glob cd("examples") -files = glob("*-*.jl") -overwrite_nb = false +# if this is called with an argument, use that as filenames +if length(ARGS) > 0 + files = ARGS + # only check files in examples/ and remove the path + files = [f for f in files if occursin("examples/", f)] + files = [replace(f, r"examples/" => "") for f in files] +else + files = glob("*-*.jl") +end + +overwrite_nb = true autorun_notebooks = false for f in files From 2c4da353d5e21d988e9a2968f49a5b3f780098f9 Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 26 Mar 2024 15:19:49 +0100 Subject: [PATCH 2/3] Apply formatting --- examples/02.00-GrayScott.jl | 8 ++++++-- .../coupling_functions/functions_CNODE_loss.jl | 16 +++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/examples/02.00-GrayScott.jl b/examples/02.00-GrayScott.jl index 16766a7..507e7dd 100644 --- a/examples/02.00-GrayScott.jl +++ b/examples/02.00-GrayScott.jl @@ -37,8 +37,12 @@ k = 0.062; # Define the **right hand sides** of the two equations: include("coupling_functions/functions_FDderivatives.jl") -F_u(u, v, grid_GS) = D_u * Laplacian(u, grid_GS.dux, grid_GS.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u) -G_v(u, v, grid_GS) = D_v * Laplacian(v, grid_GS.dvx, grid_GS.dvy) .+ u .* v .^ 2 .- (f + k) .* v +function F_u(u, v, grid_GS) + D_u * Laplacian(u, grid_GS.dux, grid_GS.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u) +end +function G_v(u, v, grid_GS) + D_v * Laplacian(v, grid_GS.dvx, grid_GS.dvy) .+ u .* v .^ 2 .- (f + k) .* v +end # Once the functions have been defined, we can create the CNODE # Notice that in the future, this same constructor will be able to use the user provided neural network to close the equations diff --git a/examples/coupling_functions/functions_CNODE_loss.jl b/examples/coupling_functions/functions_CNODE_loss.jl index c0808ce..b6921ed 100644 --- a/examples/coupling_functions/functions_CNODE_loss.jl +++ b/examples/coupling_functions/functions_CNODE_loss.jl @@ -106,7 +106,8 @@ This function creates a random loss function for the multishooting method with m # Returns - `randloss_MulDtO`: A random loss function for the multishooting method. 
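 
 # Examples
 A sketch of a call (the keyword values here are illustrative, not taken from this repository):
 ```julia
 loss = create_randloss_MulDtO(target;
     nunroll = 10, nintervals = 4, noverlaps = 1,
     nsamples = 2, λ_c = 1e2, λ_l1 = 1e-1)
 ```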
""" -function create_randloss_MulDtO(target; nunroll, nintervals = 1, noverlaps = 1, nsamples, λ_c, λ_l1) +function create_randloss_MulDtO( + target; nunroll, nintervals = 1, noverlaps = 1, nsamples, λ_c, λ_l1) # TODO: there should be some check about the consistency of the input arguments # Get the number of time steps d = ndims(target) @@ -114,7 +115,7 @@ function create_randloss_MulDtO(target; nunroll, nintervals = 1, noverlaps = 1, function randloss_MulDtO(θ) # We compute the requested length of consecutive timesteps # Notice that each interval is long nunroll+1 because we are including the initial conditions as step_0 - length_required = nintervals*(nunroll+1) - noverlaps*(nintervals-1) + length_required = nintervals * (nunroll + 1) - noverlaps * (nintervals - 1) # Zygote will select a random initial condition that can accomodate all the multishooting intervals istart = Zygote.@ignore rand(1:(nt - length_required)) trajectory = Zygote.@ignore ArrayType(selectdim(target, @@ -165,11 +166,12 @@ function loss_MulDtO_oneset(trajectory, # Get the timesteps where the intervals start #starting_points = [i*(nunroll+1-noverlaps) for i in 1:(nintervals-1)] #pushfirst!(starting_points,1) - starting_points = [i == 0 ? 1 : i*(nunroll+1-noverlaps) for i in 0:(nintervals-1)] + starting_points = [i == 0 ? 1 : i * (nunroll + 1 - noverlaps) + for i in 0:(nintervals - 1)] # Take all the time intervals and concatenate them in the batch dimension - list_tr = cat([trajectory[:, :,i:(i + nunroll)] + list_tr = cat([trajectory[:, :, i:(i + nunroll)] for i in starting_points]..., - dims = 2) + dims = 2) # Get all the initial conditions list_starts = cat([trajectory[:, :, i] for i in starting_points]..., dims = 2) @@ -184,10 +186,10 @@ function loss_MulDtO_oneset(trajectory, # (!) 
Remind that the trajectory is stored as: # pred[grid, (nintervals*nsamples), nunroll+1] # and we need to compare the last noverlaps points of an interval - pred_end = pred[:, :, end-noverlaps+1:end] + pred_end = pred[:, :, (end - noverlaps + 1):end] # with the first noverlaps points of the next interval EXCLUDING the initial condition # (which is already part of the loss function) - pred_start = pred[:, :, 2:(1+noverlaps)] + pred_start = pred[:, :, 2:(1 + noverlaps)] continuity = 0 # loop over all the samples, which have been concatenated in dim 2 for s in 1:nsamples From 41890f3586a2b56434b5d53bd3aab52abadd2d09 Mon Sep 17 00:00:00 2001 From: Aron Date: Tue, 26 Mar 2024 15:20:23 +0100 Subject: [PATCH 3/3] Add notebooks --- examples/00-Toy.ipynb | 60 ++ examples/00-Toy.md | 16 + examples/01.00-Logistic.ipynb | 384 +++++++ examples/01.00-Logistic.md | 180 ++++ examples/02.00-GrayScott.ipynb | 341 +++++++ examples/02.00-GrayScott.md | 167 +++ examples/02.01-GrayScott.ipynb | 930 +++++++++++++++++ examples/02.01-GrayScott.md | 521 ++++++++++ examples/02.02-GrayScott.ipynb | 884 ++++++++++++++++ examples/02.02-GrayScott.md | 505 +++++++++ examples/02.03-GrayScott.ipynb | 724 +++++++++++++ examples/02.03-GrayScott.md | 410 ++++++++ examples/TODO_02.04-GrayScott.ipynb | 954 ++++++++++++++++++ examples/TODO_02.04-GrayScott.md | 580 +++++++++++ .../TODO_02.04-GrayScott_alternative.ipynb | 941 +++++++++++++++++ examples/TODO_02.04-GrayScott_alternative.md | 577 +++++++++++ examples/TODO_03.01-Advecion.ipynb | 301 ++++++ examples/TODO_03.01-Advecion.md | 142 +++ 18 files changed, 8617 insertions(+) create mode 100644 examples/00-Toy.ipynb create mode 100644 examples/00-Toy.md create mode 100644 examples/01.00-Logistic.ipynb create mode 100644 examples/01.00-Logistic.md create mode 100644 examples/02.00-GrayScott.ipynb create mode 100644 examples/02.00-GrayScott.md create mode 100644 examples/02.01-GrayScott.ipynb create mode 100644 examples/02.01-GrayScott.md create mode 100644 examples/02.02-GrayScott.ipynb create mode 100644 examples/02.02-GrayScott.md create mode 100644 examples/02.03-GrayScott.ipynb create mode 100644 examples/02.03-GrayScott.md create mode 100644 examples/TODO_02.04-GrayScott.ipynb create mode 100644 examples/TODO_02.04-GrayScott.md create mode 100644 examples/TODO_02.04-GrayScott_alternative.ipynb create mode 100644 examples/TODO_02.04-GrayScott_alternative.md create mode 100644 examples/TODO_03.01-Advecion.ipynb create mode 100644 examples/TODO_03.01-Advecion.md diff --git a/examples/00-Toy.ipynb b/examples/00-Toy.ipynb new file mode 100644 index 0000000..f0e68eb --- /dev/null +++ b/examples/00-Toy.ipynb @@ -0,0 +1,60 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Import the necessary packages" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "using CoupledNODE" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Let's greet the world" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "helloworld()" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "---\n", + "\n", + "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*" + ], + "metadata": {} + } + ], + "nbformat_minor": 3, + "metadata": { + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.10.2" + }, + "kernelspec": { + "name": 
"julia-1.10", + "display_name": "Julia 1.10.2", + "language": "julia" + } + }, + "nbformat": 4 +} diff --git a/examples/00-Toy.md b/examples/00-Toy.md new file mode 100644 index 0000000..72b1219 --- /dev/null +++ b/examples/00-Toy.md @@ -0,0 +1,16 @@ +Import the necessary packages + +```julia +using CoupledNODE +``` + +Let's greet the world + +```julia +helloworld() +``` + +--- + +*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* + diff --git a/examples/01.00-Logistic.ipynb b/examples/01.00-Logistic.ipynb new file mode 100644 index 0000000..82525c1 --- /dev/null +++ b/examples/01.00-Logistic.ipynb @@ -0,0 +1,384 @@ +{ + "cells": [ + { + "outputs": [], + "cell_type": "code", + "source": [ + "import CUDA\n", + "ArrayType = CUDA.functional() ? CuArray : Array\n", + "# Import our custom backend functions\n", + "include(\"coupling_functions/functions_example.jl\")\n", + "include(\"coupling_functions/functions_NODE.jl\")\n", + "include(\"coupling_functions/functions_loss.jl\");" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "# Logistic equation and NODE\n", + "Let's study a phenomenon that can be described with the following ODE $$\\dfrac{dP}{dt} = rP\\left(1-\\dfrac{P}{K}\\right),$$ which is called the logistic equation. Given $P(t=0)=P_0$ we can solve this problem analytically to get $P(t) = \\frac{K}{1+\\left(K-P_0\\right)/P_0 \\cdot e^{-rt}}$. Let's plot the solution for $r=K=2, P_0=0.01$:" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "r = 2\n", + "K = 1\n", + "P0 = 0.01\n", + "function P(t)\n", + " return K / (1 + (K - P0) * exp(-r * t) / P0)\n", + "end\n", + "t = range(start = 0, stop = 6, step = 0.01)\n", + "Pt = P.(t)\n", + "plot(t, Pt, label = \"P(t)\", xlabel = \"t\", ylabel = \"P(t)\")" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Let's say that we want to use the logistic equation to model an experiment like the activation energy of a neuron. We run the experiment and we observe the following:" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "u_experiment = observation()\n", + "using Plots\n", + "plot(t, Pt, label = \"Best P(t) fit\")\n", + "plot!(t, u_experiment[:], label = \"Observation(t)\")" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "This means that our experimental system, despite its similarity, it is not described by a logistic ODE.\n", + "How can we then model our experiment as an ODE?\n", + "\n", + "There are many simpler alternatives for this example (e.g. Mode Decomposition, SINDY or Bayesian methods), but let's use this exercise to introduce a NODE:\n", + "$\\begin{equation}\\dfrac{du}{dt} = \\underbrace{ru\\left(1-\\dfrac{u}{K}\\right)}_{f(u)} + NN(u|\\theta).\\end{equation}$\n", + "In this NODE we are looking for a solution $u(t)$ that reproduces our observation.\n", + "We will be using [SciML](https://sciml.ai/) package [DiffEqFlux.jl](https://github.com/SciML/DiffEqFlux.jl) and scpecifically [NeuralODE](https://docs.sciml.ai/DiffEqFlux/stable/examples/neural_ode/) for defining and solving the problem.\n", + "## Solve the NODE\n", + "We solve this 1D NODE using introducing the functionalities of this repository:" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "* We create the NN using `Lux`. 
In this example we do not discuss the structure of the NN, but we leave it as a black box that can be designed by the user. We will show later how to take inspiration from the physics of the problem to design optimal NN." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import Lux\n", + "NN = Lux.Chain(Lux.SkipConnection(Lux.Dense(1, 3),\n", + " (out, u) -> u * out[1] .+ u .* u .* out[2] .+ u .* log.(abs.(u)) .* out[3]));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "* We define the force $f(u)$ compatibly with SciML." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_u(u) = @. r * u * (1.0 - u / K);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "* We create the right hand side of the NODE, by combining the NN with f_u" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_NODE = create_f_NODE(NN, f_u; is_closed = true);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and get the parametrs that you want to train" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import Random\n", + "rng = Random.seed!(1234)\n", + "θ, st = Lux.setup(rng, f_NODE);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "* We define the NODE" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import DiffEqFlux: NeuralODE\n", + "import DifferentialEquations: Tsit5\n", + "trange = (0.0, 6.0)\n", + "u0 = [0.01]\n", + "full_NODE = NeuralODE(f_NODE, trange, Tsit5(), adaptive = false, dt = 0.001, saveat = 0.2);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "* We solve the NODE using the zero-initialized parameters\n", + "*Note:* `full_NODE` is a `NeuralODE` function that returns a `DiffEqBase.ODESolution` object. This object contains the solution of the ODE, but it also contains additional information like the time points at which the solution was evaluated and the parameters of the ODE. We can access the solution using `[1]`, and we convert it to an `Array` to be able to use it for further calculations and plot." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "untrained_NODE_solution = Array(full_NODE(u0, θ, st)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "## Prepare the model\n", + "First, we need to design the **loss function**. For this example, we use *multishooting a posteriori* fitting [(MulDtO)](https://docs.sciml.ai/DiffEqFlux/dev/examples/multiple_shooting/). Using `Zygote` we compare `nintervals` of length `nunroll` to get the gradient. Notice that this method is differentiating through the solution of the NODE!" 
+ ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "nunroll = 60\n", + "nintervals = 10\n", + "myloss = create_randloss_MulDtO(u_experiment, nunroll = nunroll, nintervals = nintervals);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Second, we define this auxiliary NODE that will be used for training" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dt = 0.01 # it has to be as fine as the data\n", + "t_train_range = (0.0, dt * (nunroll + 1)) # it has to be as long as unroll\n", + "training_NODE = NeuralODE(f_NODE,\n", + " t_train_range,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = dt);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "To initialize the training, we need some objects to monitor the procedure, and we trigger the first compilation." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "lhist = Float32[];" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Initialize and trigger the compilation of the model" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import ComponentArrays\n", + "pinit = ComponentArrays.ComponentArray(θ);\n", + "myloss(pinit); # trigger compilation" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Select the autodifferentiation type" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "using OptimizationOptimisers\n", + "adtype = Optimization.AutoZygote();" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We transform the NeuralODE into an optimization problem" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "optf = Optimization.OptimizationFunction((x, p) -> myloss(x), adtype);\n", + "optprob = Optimization.OptimizationProblem(optf, pinit);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Select the training algorithm:\n", + "We choose Adam with learning rate 0.1, with gradient clipping" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "ClipAdam = OptimiserChain(Adam(1.0e-1), ClipGrad(1));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "## Train de NODE\n", + "We are ready to train the NODE.\n", + "Notice that the block can be repeated to continue training" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "result_neuralode = Optimization.solve(optprob,\n", + " ClipAdam;\n", + " # Commented out the line that uses a custom callback to track loss over time\n", + " ##callback = callback,\n", + " maxiters = 100)\n", + "pinit = result_neuralode.u;\n", + "optprob = Optimization.OptimizationProblem(optf, pinit);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "## Analyse the results\n", + "Visualize the obtained results" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "plot()\n", + "plot(t, Pt, label = \"Best P(t) fit\")\n", + "plot!(t, u_experiment[:], label = \"Observation(t)\")\n", + "scatter!(range(start = 0, stop = 6, step = 0.2),\n", + " untrained_NODE_solution[:],\n", + " 
label = \"untrained NODE\",\n",
+        "    marker = :circle)\n",
+        "scatter!(range(start = 0, stop = 6, step = 0.2),\n",
+        "    Array(full_NODE([u_experiment[1]], result_neuralode.u, st)[1])[:],\n",
+        "    label = \"Trained NODE\",\n",
+        "    marker = :circle)"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "---\n",
+        "\n",
+        "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*"
+      ],
+      "metadata": {}
+    }
+  ],
+  "nbformat_minor": 3,
+  "metadata": {
+    "language_info": {
+      "file_extension": ".jl",
+      "mimetype": "application/julia",
+      "name": "julia",
+      "version": "1.10.2"
+    },
+    "kernelspec": {
+      "name": "julia-1.10",
+      "display_name": "Julia 1.10.2",
+      "language": "julia"
+    }
+  },
+  "nbformat": 4
+}
diff --git a/examples/01.00-Logistic.md b/examples/01.00-Logistic.md
new file mode 100644
index 0000000..63e7555
--- /dev/null
+++ b/examples/01.00-Logistic.md
@@ -0,0 +1,180 @@
+```julia
+import CUDA
+ArrayType = CUDA.functional() ? CuArray : Array
+# Import our custom backend functions
+include("coupling_functions/functions_example.jl")
+include("coupling_functions/functions_NODE.jl")
+include("coupling_functions/functions_loss.jl");
+```
+
+# Logistic equation and NODE
+Let's study a phenomenon that can be described with the following ODE $$\dfrac{dP}{dt} = rP\left(1-\dfrac{P}{K}\right),$$ which is called the logistic equation. Given $P(t=0)=P_0$ we can solve this problem analytically to get $P(t) = \frac{K}{1+\left(K-P_0\right)/P_0 \cdot e^{-rt}}$. Let's plot the solution for $r=2$, $K=1$, $P_0=0.01$:
+
+```julia
+r = 2
+K = 1
+P0 = 0.01
+function P(t)
+    return K / (1 + (K - P0) * exp(-r * t) / P0)
+end
+t = range(start = 0, stop = 6, step = 0.01)
+Pt = P.(t)
+plot(t, Pt, label = "P(t)", xlabel = "t", ylabel = "P(t)")
+```
+
+Let's say that we want to use the logistic equation to model an experiment like the activation energy of a neuron. We run the experiment and we observe the following:
+
+```julia
+u_experiment = observation()
+using Plots
+plot(t, Pt, label = "Best P(t) fit")
+plot!(t, u_experiment[:], label = "Observation(t)")
+```
+
+This means that our experimental system, despite its similarity, is not described by a logistic ODE.
+How can we then model our experiment as an ODE?
+
+There are many simpler alternatives for this example (e.g. Mode Decomposition, SINDY or Bayesian methods), but let's use this exercise to introduce a NODE:
+$\begin{equation}\dfrac{du}{dt} = \underbrace{ru\left(1-\dfrac{u}{K}\right)}_{f(u)} + NN(u|\theta).\end{equation}$
+In this NODE we are looking for a solution $u(t)$ that reproduces our observation.
+We will be using the [SciML](https://sciml.ai/) package [DiffEqFlux.jl](https://github.com/SciML/DiffEqFlux.jl) and specifically [NeuralODE](https://docs.sciml.ai/DiffEqFlux/stable/examples/neural_ode/) for defining and solving the problem.
+## Solve the NODE
+We solve this 1D NODE by introducing the functionalities of this repository:
+
+* We create the NN using `Lux`. In this example we do not discuss the structure of the NN, but we leave it as a black box that can be designed by the user. We will show later how to take inspiration from the physics of the problem to design an optimal NN.
+
+```julia
+import Lux
+NN = Lux.Chain(Lux.SkipConnection(Lux.Dense(1, 3),
+    (out, u) -> u * out[1] .+ u .* u .* out[2] .+ u .* log.(abs.(u)) .* out[3]));
+```
+
+* We define the force $f(u)$ compatibly with SciML.
+
+```julia
+f_u(u) = @. r * u * (1.0 - u / K);
+```
+
+* We create the right hand side of the NODE, by combining the NN with f_u
+
+```julia
+f_NODE = create_f_NODE(NN, f_u; is_closed = true);
+```
+
+and get the parameters that you want to train
+
+```julia
+import Random
+rng = Random.seed!(1234)
+θ, st = Lux.setup(rng, f_NODE);
+```
+
+* We define the NODE
+
+```julia
+import DiffEqFlux: NeuralODE
+import DifferentialEquations: Tsit5
+trange = (0.0, 6.0)
+u0 = [0.01]
+full_NODE = NeuralODE(f_NODE, trange, Tsit5(), adaptive = false, dt = 0.001, saveat = 0.2);
+```
+
+* We solve the NODE using the zero-initialized parameters
+*Note:* `full_NODE` is a `NeuralODE` function that returns a `DiffEqBase.ODESolution` object. This object contains the solution of the ODE, but it also contains additional information like the time points at which the solution was evaluated and the parameters of the ODE. We can access the solution using `[1]`, and we convert it to an `Array` to be able to use it for further calculations and plot.
+
+```julia
+untrained_NODE_solution = Array(full_NODE(u0, θ, st)[1]);
+```
+
+## Prepare the model
+First, we need to design the **loss function**. For this example, we use *multishooting a posteriori* fitting [(MulDtO)](https://docs.sciml.ai/DiffEqFlux/dev/examples/multiple_shooting/). Using `Zygote` we compare `nintervals` of length `nunroll` to get the gradient. Notice that this method is differentiating through the solution of the NODE!
+
+```julia
+nunroll = 60
+nintervals = 10
+myloss = create_randloss_MulDtO(u_experiment, nunroll = nunroll, nintervals = nintervals);
+```
+
+Second, we define the auxiliary NODE that will be used for training
+
+```julia
+dt = 0.01 # it has to be as fine as the data
+t_train_range = (0.0, dt * (nunroll + 1)) # it has to be as long as unroll
+training_NODE = NeuralODE(f_NODE,
+    t_train_range,
+    Tsit5(),
+    adaptive = false,
+    dt = dt,
+    saveat = dt);
+```
+
+To initialize the training, we need some objects to monitor the procedure, and we trigger the first compilation.
+
+```julia
+lhist = Float32[];
+```
+
+Initialize and trigger the compilation of the model
+
+```julia
+import ComponentArrays
+pinit = ComponentArrays.ComponentArray(θ);
+myloss(pinit); # trigger compilation
+```
+
+Select the autodifferentiation type
+
+```julia
+using OptimizationOptimisers
+adtype = Optimization.AutoZygote();
+```
+
+We transform the NeuralODE into an optimization problem
+
+```julia
+optf = Optimization.OptimizationFunction((x, p) -> myloss(x), adtype);
+optprob = Optimization.OptimizationProblem(optf, pinit);
+```
+
+Select the training algorithm:
+We choose Adam with learning rate 0.1, with gradient clipping
+
+```julia
+ClipAdam = OptimiserChain(Adam(1.0e-1), ClipGrad(1));
+```
+
+## Train the NODE
+We are ready to train the NODE.
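+
+Since `lhist` was created above to monitor the procedure but the `callback` line in the block below is commented out, here is a minimal sketch of such a callback (our assumption, not code from this repository; in recent Optimization.jl versions the first argument is an `OptimizationState` rather than the raw parameters):
+
+```julia
+callback = function (p, l)
+    push!(lhist, l) # record the loss at every iteration
+    return false # returning false lets the optimization continue
+end
+```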
+Notice that the block can be repeated to continue training + +```julia +result_neuralode = Optimization.solve(optprob, + ClipAdam; + # Commented out the line that uses a custom callback to track loss over time + ##callback = callback, + maxiters = 100) +pinit = result_neuralode.u; +optprob = Optimization.OptimizationProblem(optf, pinit); +``` + +## Analyse the results +Visualize the obtained results + +```julia +plot() +plot(t, Pt, label = "Best P(t) fit") +plot!(t, u_experiment[:], label = "Observation(t)") +scatter!(range(start = 0, stop = 6, step = 0.2), + untrained_NODE_solution[:], + label = "untrained NODE", + marker = :circle) +scatter!(range(start = 0, stop = 6, step = 0.2), + Array(full_NODE([u_experiment[1]], result_neuralode.u, st)[1])[:], + label = "Trained NODE", + marker = :circle) +``` + +--- + +*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* + diff --git a/examples/02.00-GrayScott.ipynb b/examples/02.00-GrayScott.ipynb new file mode 100644 index 0000000..a7435a4 --- /dev/null +++ b/examples/02.00-GrayScott.ipynb @@ -0,0 +1,341 @@ +{ + "cells": [ + { + "outputs": [], + "cell_type": "code", + "source": [ + "import CUDA\n", + "ArrayType = CUDA.functional() ? CuArray : Array;\n", + "# Import our custom backend functions\n", + "include(\"coupling_functions/functions_NODE.jl\")" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "# Gray-Scott model - explicit solution\n", + "In following examples we will use the GS model to showcase how it can be represented as Coupled Neural ODEs (CNODEs). But let us first explore the GS model starting with an explicit solution of it. We will be using [SciML](https://sciml.ai/) package [DiffEqFlux.jl`](https://github.com/SciML/DiffEqFlux.jl) and scpecifically [`NeuralODE`](https://docs.sciml.ai/DiffEqFlux/stable/examples/neural_ode/) for defining and solving the problem." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "The system that we want to solve, called the the Gray-Scott model, is defined by the following equations:\n", + "\\begin{equation}\\begin{cases} \\frac{du}{dt} = D_u \\Delta u - uv^2 + f(1-u) \\equiv F_u(u,v) \\\\ \\frac{dv}{dt} = D_v \\Delta v + uv^2 - (f+k)v \\equiv G_v(u,v)\\end{cases} \\end{equation}\n", + "where $u(x,y,t):\\mathbb{R}^2\\times \\mathbb{R}\\rightarrow \\mathbb{R}$ is the concentration of species 1, while $v(x,y,t)$ is the concentration of species 2. This model reproduce the effect of the two species diffusing in their environment and reacting together.\n", + "This effect is captured by the ratios between $D_u$ and $D_v$ (the diffusion coefficients) and $f$ and $k$ (the reaction rates)." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Let's start creating a grid to discretize the problem. Notice that in literature the coefficients are usually scaled such that $dx=dy=1$, so we will use this scaling to have a direct comparison with literature." 
+ ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dux = duy = dvx = dvy = 1.0\n", + "nux = nuy = nvx = nvy = 100\n", + "grid_GS = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We start defining a central concentration of $v$ and a constant concentration of $u$:" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "function initialize_uv(grid, u_bkg, v_bkg, center_size)\n", + " u_initial = u_bkg * ones(grid.nux, grid.nuy)\n", + " v_initial = zeros(grid.nvx, grid.nvy)\n", + " v_initial[Int(grid.nvx / 2 - center_size):Int(grid.nvx / 2 + center_size), Int(grid.nvy / 2 - center_size):Int(grid.nvy / 2 + center_size)] .= v_bkg\n", + " return u_initial, v_initial\n", + "end\n", + "u_initial, v_initial = initialize_uv(grid_GS, 0.8, 0.9, 4);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We can now define the initial condition as a flattened concatenated array" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = vcat(reshape(u_initial, grid_GS.nux * grid_GS.nuy, 1),\n", + " reshape(v_initial, grid_GS.nvx * grid_GS.nvy, 1));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "From the literature, we select the following parameters in order to form nice patterns." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "D_u = 0.16\n", + "D_v = 0.08\n", + "f = 0.055\n", + "k = 0.062;" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Define the **right hand sides** of the two equations:" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "include(\"coupling_functions/functions_FDderivatives.jl\")\n", + "function F_u(u, v, grid_GS)\n", + " D_u * Laplacian(u, grid_GS.dux, grid_GS.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u)\n", + "end\n", + "function G_v(u, v, grid_GS)\n", + " D_v * Laplacian(v, grid_GS.dvx, grid_GS.dvy) .+ u .* v .^ 2 .- (f + k) .* v\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Once the functions have been defined, we can create the CNODE\n", + "Notice that in the future, this same constructor will be able to use the user provided neural network to close the equations" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_CNODE = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and we ask Lux for the parameters to train and their structure" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "using Lux\n", + "import Random\n", + "rng = Random.seed!(1234);\n", + "θ, st = Lux.setup(rng, f_CNODE);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "in this example we are not training any parameters, so we can confirm that the vector θ is empty" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "length(θ) == 0;" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We now do a short *burnout run* to get rid of the initial artifacts. 
This allows us to discard the transient dynamics and to have a good initial condition for the data collection run." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "using DifferentialEquations: Tsit5\n", + "using DiffEqFlux: NeuralODE\n", + "trange_burn = (0.0, 10.0);\n", + "dt, saveat = (1e-2, 1);\n", + "full_CNODE = NeuralODE(f_CNODE,\n", + " trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "burnout_CNODE_solution = Array(full_CNODE(uv0, θ, st)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "**CNODE run**" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "We use the output of the *burnout run* to start a longer simulation" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = burnout_CNODE_solution[:, :, end];\n", + "trange = (0.0, 8000.0);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "the maximum suggested time step for GS is defined as `1/(4 * Dmax)`" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dt, saveat = (1 / (4 * max(D_u, D_v)), 25);\n", + "full_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat);\n", + "untrained_CNODE_solution = Array(full_CNODE(uv0, θ, st)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "And we unpack the solution to get the two species. Remember that we have concatenated $u$ and $v$ in the same array." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "u = reshape(untrained_CNODE_solution[1:(grid_GS.Nu), :, :],\n", + " grid_GS.nux,\n", + " grid_GS.nuy,\n", + " size(untrained_CNODE_solution, 2),\n", + " :);\n", + "v = reshape(untrained_CNODE_solution[(grid_GS.Nu + 1):end, :, :],\n", + " grid_GS.nvx,\n", + " grid_GS.nvy,\n", + " size(untrained_CNODE_solution, 2),\n", + " :);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Finally, plot the solution as an animation" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "using Plots\n", + "anim = Animation()\n", + "fig = plot(layout = (1, 2), size = (600, 300))\n", + "@gif for i in 1:2:size(u, 4)\n", + " p1 = heatmap(u[:, :, 1, i],\n", + " axis = false,\n", + " bar = true,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u(x,y)\")\n", + " p2 = heatmap(v[:, :, 1, i],\n", + " axis = false,\n", + " bar = true,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v(x,y)\")\n", + " time = round(i * saveat, digits = 0)\n", + " fig = plot(p1, p2, layout = (1, 2), plot_title = \"time = $(time)\")\n", + " frame(anim, fig)\n", + "end\n", + "if isdir(\"./plots\")\n", + " gif(anim, \"./plots/02.00.GS.gif\", fps = 10)\n", + "else\n", + " gif(anim, \"examples/plots/02.00.GS.gif\", fps = 10)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "---\n", + "\n", + "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*" + ], + "metadata": {} + } + ], + "nbformat_minor": 3, + "metadata": { + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.10.2" + }, + "kernelspec": { + "name": "julia-1.10", + 
"display_name": "Julia 1.10.2", + "language": "julia" + } + }, + "nbformat": 4 +} diff --git a/examples/02.00-GrayScott.md b/examples/02.00-GrayScott.md new file mode 100644 index 0000000..d3f3a2d --- /dev/null +++ b/examples/02.00-GrayScott.md @@ -0,0 +1,167 @@ +```julia +import CUDA +ArrayType = CUDA.functional() ? CuArray : Array; +# Import our custom backend functions +include("coupling_functions/functions_NODE.jl") +``` + +# Gray-Scott model - explicit solution +In following examples we will use the GS model to showcase how it can be represented as Coupled Neural ODEs (CNODEs). But let us first explore the GS model starting with an explicit solution of it. We will be using [SciML](https://sciml.ai/) package [DiffEqFlux.jl`](https://github.com/SciML/DiffEqFlux.jl) and scpecifically [`NeuralODE`](https://docs.sciml.ai/DiffEqFlux/stable/examples/neural_ode/) for defining and solving the problem. + +The system that we want to solve, called the the Gray-Scott model, is defined by the following equations: +\begin{equation}\begin{cases} \frac{du}{dt} = D_u \Delta u - uv^2 + f(1-u) \equiv F_u(u,v) \\ \frac{dv}{dt} = D_v \Delta v + uv^2 - (f+k)v \equiv G_v(u,v)\end{cases} \end{equation} +where $u(x,y,t):\mathbb{R}^2\times \mathbb{R}\rightarrow \mathbb{R}$ is the concentration of species 1, while $v(x,y,t)$ is the concentration of species 2. This model reproduce the effect of the two species diffusing in their environment and reacting together. +This effect is captured by the ratios between $D_u$ and $D_v$ (the diffusion coefficients) and $f$ and $k$ (the reaction rates). + +Let's start creating a grid to discretize the problem. Notice that in literature the coefficients are usually scaled such that $dx=dy=1$, so we will use this scaling to have a direct comparison with literature. + +```julia +dux = duy = dvx = dvy = 1.0 +nux = nuy = nvx = nvy = 100 +grid_GS = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy); +``` + +We start defining a central concentration of $v$ and a constant concentration of $u$: + +```julia +function initialize_uv(grid, u_bkg, v_bkg, center_size) + u_initial = u_bkg * ones(grid.nux, grid.nuy) + v_initial = zeros(grid.nvx, grid.nvy) + v_initial[Int(grid.nvx / 2 - center_size):Int(grid.nvx / 2 + center_size), Int(grid.nvy / 2 - center_size):Int(grid.nvy / 2 + center_size)] .= v_bkg + return u_initial, v_initial +end +u_initial, v_initial = initialize_uv(grid_GS, 0.8, 0.9, 4); +``` + +We can now define the initial condition as a flattened concatenated array + +```julia +uv0 = vcat(reshape(u_initial, grid_GS.nux * grid_GS.nuy, 1), + reshape(v_initial, grid_GS.nvx * grid_GS.nvy, 1)); +``` + +From the literature, we select the following parameters in order to form nice patterns. 
+ +```julia +D_u = 0.16 +D_v = 0.08 +f = 0.055 +k = 0.062; +``` + +Define the **right hand sides** of the two equations: + +```julia +include("coupling_functions/functions_FDderivatives.jl") +function F_u(u, v, grid_GS) + D_u * Laplacian(u, grid_GS.dux, grid_GS.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u) +end +function G_v(u, v, grid_GS) + D_v * Laplacian(v, grid_GS.dvx, grid_GS.dvy) .+ u .* v .^ 2 .- (f + k) .* v +end +``` + +Once the functions have been defined, we can create the CNODE +Notice that in the future, this same constructor will be able to use the user provided neural network to close the equations + +```julia +f_CNODE = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false); +``` + +and we ask Lux for the parameters to train and their structure + +```julia +using Lux +import Random +rng = Random.seed!(1234); +θ, st = Lux.setup(rng, f_CNODE); +``` + +in this example we are not training any parameters, so we can confirm that the vector θ is empty + +```julia +length(θ) == 0; +``` + +We now do a short *burnout run* to get rid of the initial artifacts. This allows us to discard the transient dynamics and to have a good initial condition for the data collection run. + +```julia +using DifferentialEquations: Tsit5 +using DiffEqFlux: NeuralODE +trange_burn = (0.0, 10.0); +dt, saveat = (1e-2, 1); +full_CNODE = NeuralODE(f_CNODE, + trange_burn, + Tsit5(), + adaptive = false, + dt = dt, + saveat = saveat); +burnout_CNODE_solution = Array(full_CNODE(uv0, θ, st)[1]); +``` + +**CNODE run** + +We use the output of the *burnout run* to start a longer simulation + +```julia +uv0 = burnout_CNODE_solution[:, :, end]; +trange = (0.0, 8000.0); +``` + +the maximum suggested time step for GS is defined as `1/(4 * Dmax)` + +```julia +dt, saveat = (1 / (4 * max(D_u, D_v)), 25); +full_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat); +untrained_CNODE_solution = Array(full_CNODE(uv0, θ, st)[1]); +``` + +And we unpack the solution to get the two species. Remember that we have concatenated $u$ and $v$ in the same array. + +```julia +u = reshape(untrained_CNODE_solution[1:(grid_GS.Nu), :, :], + grid_GS.nux, + grid_GS.nuy, + size(untrained_CNODE_solution, 2), + :); +v = reshape(untrained_CNODE_solution[(grid_GS.Nu + 1):end, :, :], + grid_GS.nvx, + grid_GS.nvy, + size(untrained_CNODE_solution, 2), + :); +``` + +Finally, plot the solution as an animation + +```julia +using Plots +anim = Animation() +fig = plot(layout = (1, 2), size = (600, 300)) +@gif for i in 1:2:size(u, 4) + p1 = heatmap(u[:, :, 1, i], + axis = false, + bar = true, + aspect_ratio = 1, + color = :reds, + title = "u(x,y)") + p2 = heatmap(v[:, :, 1, i], + axis = false, + bar = true, + aspect_ratio = 1, + color = :blues, + title = "v(x,y)") + time = round(i * saveat, digits = 0) + fig = plot(p1, p2, layout = (1, 2), plot_title = "time = $(time)") + frame(anim, fig) +end +if isdir("./plots") + gif(anim, "./plots/02.00.GS.gif", fps = 10) +else + gif(anim, "examples/plots/02.00.GS.gif", fps = 10) +end +``` + +--- + +*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* + diff --git a/examples/02.01-GrayScott.ipynb b/examples/02.01-GrayScott.ipynb new file mode 100644 index 0000000..093fa87 --- /dev/null +++ b/examples/02.01-GrayScott.ipynb @@ -0,0 +1,930 @@ +{ + "cells": [ + { + "outputs": [], + "cell_type": "code", + "source": [ + "import CUDA\n", + "ArrayType = CUDA.functional() ? 
CuArray : Array;"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Learning the Gray-Scott model: a priori fitting"
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "In the previous example [02.00-GrayScott.jl](02.00-GrayScott.jl) we have seen how to use the CNODE to solve the Gray-Scott model via an explicit method.\n",
+        "Here we introduce the *learning part* of the CNODE, and show how it can be used to close the CNODE. We are going to train the neural network via **a priori fitting**, and in the next example [02.02-GrayScott](02.02-GrayScott.jl) we will show how to use a posteriori fitting."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "As a reminder, the GS model is defined by\n",
+        "\\begin{equation}\\begin{cases} \\frac{du}{dt} = D_u \\Delta u - uv^2 + f(1-u) \\equiv F_u(u,v) \\\\ \\frac{dv}{dt} = D_v \\Delta v + uv^2 - (f+k)v \\equiv G_v(u,v)\\end{cases} \\end{equation}\n",
+        "where $u(x,y,t):\\mathbb{R}^2\\times \\mathbb{R}\\rightarrow \\mathbb{R}$ is the concentration of species 1, while $v(x,y,t)$ is the concentration of species 2. This model reproduces the effect of the two species diffusing in their environment and reacting together.\n",
+        "This effect is captured by the ratios between $D_u$ and $D_v$ (diffusion coefficients) and $f$ and $k$ (reaction rates)."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "In this example, we will first (I) use the exact GS model to gather some data;\n",
+        "then, in the second part (II), we will train a generic CNODE to show that it can learn the GS model from the data."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## I. Solving GS to collect data\n",
+        "Definition of the grid"
+      ],
+      "metadata": {}
+    },
+    {
+      "outputs": [],
+      "cell_type": "code",
+      "source": [
+        "include(\"coupling_functions/functions_NODE.jl\")\n",
+        "dux = duy = dvx = dvy = 1.0\n",
+        "nux = nuy = nvx = nvy = 64\n",
+        "grid_GS = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Definition of the initial condition as a random perturbation over a constant background to add variety.\n",
+        "Notice that these initial conditions are different from those of the previous example."
+      ],
+      "metadata": {}
+    },
+    {
+      "outputs": [],
+      "cell_type": "code",
+      "source": [
+        "import Random\n",
+        "function initial_condition(grid, U₀, V₀, ε_u, ε_v; nsimulations = 1)\n",
+        "    u_init = U₀ .+ ε_u .* Random.randn(grid.nux, grid.nuy, nsimulations)\n",
+        "    v_init = V₀ .+ ε_v .* Random.randn(grid.nvx, grid.nvy, nsimulations)\n",
+        "    return u_init, v_init\n",
+        "end\n",
+        "U₀ = 0.5 # initial concentration of u\n",
+        "V₀ = 0.25 # initial concentration of v\n",
+        "ε_u = 0.05 # magnitude of the perturbation on u\n",
+        "ε_v = 0.1 # magnitude of the perturbation on v\n",
+        "u_initial, v_initial = initial_condition(grid_GS, U₀, V₀, ε_u, ε_v, nsimulations = 20);"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "We define the initial condition as a flattened concatenated array"
+      ],
+      "metadata": {}
+    },
+    {
+      "outputs": [],
+      "cell_type": "code",
+      "source": [
+        "uv0 = vcat(reshape(u_initial, grid_GS.Nu, :), reshape(v_initial, grid_GS.Nv, :));"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "These are the GS parameters (also used in example 02.00) that we will try to learn"
+      ],
+      "metadata": {}
+    },
+    {
+      "outputs": [],
+      "cell_type": "code",
+      "source": [
+        "D_u = 0.16\n",
+        "D_v = 0.08\n",
+        "f = 0.055\n",
+        "k = 0.062;"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Exact right hand sides (functions) of the GS model"
+      ],
+      "metadata": {}
+    },
+    {
+      "outputs": [],
+      "cell_type": "code",
+      "source": [
+        "include(\"coupling_functions/functions_FDderivatives.jl\")\n",
+        "F_u(u, v, grid) = D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u)\n",
+        "G_v(u, v, grid) = D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2 .- (f + k) .* v"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Definition of the CNODE"
+      ],
+      "metadata": {}
+    },
+    {
+      "outputs": [],
+      "cell_type": "code",
+      "source": [
+        "import Lux\n",
+        "f_CNODE = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false);\n",
+        "rng = Random.seed!(1234);\n",
+        "θ_0, st_0 = Lux.setup(rng, f_CNODE);"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Burnout run:** to discard the results of the initial conditions.\n",
+        "In this case we need two burnouts: a first one with a relatively large time step and then a second one with a smaller time step. This allows us to discard the transient dynamics and to have a good initial condition for the data collection run."
+ ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import DifferentialEquations: Tsit5\n", + "import DiffEqFlux: NeuralODE\n", + "trange_burn = (0.0, 1.0)\n", + "dt, saveat = (1e-2, 1)\n", + "burnout_CNODE = NeuralODE(f_CNODE,\n", + " trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "burnout_CNODE_solution = Array(burnout_CNODE(uv0, θ_0, st_0)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Second burnout with a smaller timestep" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "trange_burn = (0.0, 500.0)\n", + "dt, saveat = (1 / (4 * max(D_u, D_v)), 100)\n", + "burnout_CNODE = NeuralODE(f_CNODE,\n", + " trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "burnout_CNODE_solution = Array(burnout_CNODE(burnout_CNODE_solution[:, :, end], θ_0, st_0)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Data collection run" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = burnout_CNODE_solution[:, :, end];\n", + "trange = (0.0, 2000.0);\n", + "dt, saveat = (1 / (4 * max(D_u, D_v)), 1);\n", + "GS_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat);\n", + "GS_sim = Array(GS_CNODE(uv0, θ_0, st_0)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "`GS_sim` contains the solutions of $u$ and $v$ for the specified `trange` and `nsimulations` initial conditions. If you explore `GS_sim` you will see that it has the shape `((nux * nuy) + (nvx * nvy), nsimulations, timesteps)`.\n", + "`uv_data` is a reshaped version of `GS_sim` that has the shape `(nux * nuy + nvx * nvy, nsimulations * timesteps)`. This is the format that we will use to train the CNODE." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv_data = reshape(GS_sim, size(GS_sim, 1), size(GS_sim, 2) * size(GS_sim, 3));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We define `FG_target` containing the right hand sides (i.e. $\\frac{du}{dt} and \\frac{dv}{dt}$) of each one of the samples. We will see later that for the training `FG_target` is used as the labels to do derivative fitting." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "FG_target = Array(f_CNODE(uv_data, θ_0, st_0)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "## II. 
Training a CNODE to learn the GS model via a priori training\n",
+        "To learn the GS model, we will use the following CNODE\n",
+        "\\begin{equation}\\begin{cases} \\frac{du}{dt} = D_u \\Delta u - uv^2 + \\theta_{u,1} u +\\theta_{u,2} v +\\theta_{u,3} \\\\ \\frac{dv}{dt} = D_v \\Delta v + uv^2 + \\theta_{v,1} u +\\theta_{v,2} v +\\theta_{v,3} \\end{cases} \\end{equation}\n",
+        "In this example the deterministic function contains the diffusion and the coupling terms, while the model has to learn the source and death terms.\n",
+        "The deterministic functions of the two coupled equations are:"
+      ],
+      "metadata": {}
+    },
+    {
+      "outputs": [],
+      "cell_type": "code",
+      "source": [
+        "import Zygote\n",
+        "F_u_open(u, v, grid) = Zygote.@ignore D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2;\n",
+        "G_v_open(u, v, grid) = Zygote.@ignore D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2;"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "We tell Zygote to ignore this tree branch for the gradient propagation."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "For the trainable part, we define an abstract Lux layer"
+      ],
+      "metadata": {}
+    },
+    {
+      "outputs": [],
+      "cell_type": "code",
+      "source": [
+        "struct GSLayer{F} <: Lux.AbstractExplicitLayer\n",
+        "    init_weight::F\n",
+        "end"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "and its (outer) constructor"
+      ],
+      "metadata": {}
+    },
+    {
+      "outputs": [],
+      "cell_type": "code",
+      "source": [
+        "function GSLayer(; init_weight = Lux.zeros32)\n",
+        "    #function GSLayer(; init_weight = glorot_uniform)\n",
+        "    return GSLayer(init_weight)\n",
+        "end"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "We also need to specify how to initialize its parameters and states.\n",
+        "This custom layer does not have any hidden states (RNGs) that are modified."
+      ],
+      "metadata": {}
+    },
+    {
+      "outputs": [],
+      "cell_type": "code",
+      "source": [
+        "function Lux.initialparameters(rng::Random.AbstractRNG, (; init_weight)::GSLayer)\n",
+        "    (;\n",
+        "        gs_weights = init_weight(rng, 3),)\n",
+        "end\n",
+        "Lux.initialstates(::Random.AbstractRNG, ::GSLayer) = (;)\n",
+        "Lux.parameterlength((;)::GSLayer) = 3\n",
+        "Lux.statelength(::GSLayer) = 0"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "We define how to pass inputs through `GSLayer`, assuming the following:\n",
+        "- Input size: `(N, N, 2, nsample)`, where the two channels are $u$ and $v$.\n",
+        "- Output size: `(N, N, nsample)` where we assumed monochannel output, so we dropped the channel dimension."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "This is what each layer does. Notice that the layer does not modify the state."
+      ],
+      "metadata": {}
+    },
+    {
+      "outputs": [],
+      "cell_type": "code",
+      "source": [
+        "function ((;)::GSLayer)(x, params, state)\n",
+        "    u = x[:, :, 1, :]\n",
+        "    v = x[:, :, 2, :]\n",
+        "    out = params.gs_weights[1] .* u .+ params.gs_weights[2] .* v .+ params.gs_weights[3]\n",
+        "    out, state\n",
+        "end"
+      ],
+      "metadata": {},
+      "execution_count": null
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "We create the trainable models. In this case it is just a GS layer, but the model can be as complex as needed."
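+        ,
+        "\n",
+        "For instance, a sketch of a deeper closure (our illustration, not used in this example; it assumes the `(N, N, 2, nsample)` input layout described above and the standard `Lux.Conv` layer):\n",
+        "```julia\n",
+        "NN_u = Lux.Chain(Lux.Conv((3, 3), 2 => 2; pad = 1), GSLayer())\n",
+        "```"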
+ ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "NN_u = GSLayer()\n", + "NN_v = GSLayer()" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We can now close the CNODE with the Neural Network" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_closed_CNODE = create_f_CNODE(F_u_open, G_v_open, grid_GS, NN_u, NN_v; is_closed = true)\n", + "θ, st = Lux.setup(rng, f_closed_CNODE);\n", + "print(θ)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Check that the closed CNODE can reproduce the GS model if the parameters are set to the correct values" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import ComponentArrays\n", + "correct_w_u = [-f, 0, f];\n", + "correct_w_v = [0, -(f + k), 0];\n", + "θ_correct = ComponentArrays.ComponentArray(θ);\n", + "θ_correct.layer_3.layer_1.gs_weights = correct_w_u;\n", + "θ_correct.layer_3.layer_2.gs_weights = correct_w_v;" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Notice that they are the same within a tolerance of 1e-7" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "isapprox(f_closed_CNODE(GS_sim[:, 1, 1], θ_correct, st)[1],\n", + " f_CNODE(GS_sim[:, 1, 1], θ_0, st_0)[1],\n", + " atol = 1e-7,\n", + " rtol = 1e-7)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "but now with a tolerance of 1e-8 this check returns `false`." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "isapprox(f_closed_CNODE(GS_sim[:, 1, 1], θ_correct, st)[1],\n", + " f_CNODE(GS_sim[:, 1, 1], θ_0, st_0)[1],\n", + " atol = 1e-8,\n", + " rtol = 1e-8)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "In a chaotic system like GS, this would be enough to produce different dynamics, so be careful about this" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "If you have problems with training the model, you can cheat and start from the solution to check your implementation:\n", + "```julia\n", + "θ.layer_3.layer_1.gs_weights = correct_w_u\n", + "θ.layer_3.layer_2.gs_weights = correct_w_v\n", + "pinit = θ\n", + "```" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Design the loss function - a priori fitting\n", + "For this example, we use *a priori* fitting. In this approach, the loss function is defined to minimize the difference between the derivatives of $\\frac{du}{dt}$ and $\\frac{dv}{dt}$ predicted by the model and calculated via explicit method `FG_target`.\n", + "In practice, we use [Zygote](https://fluxml.ai/Zygote.jl/stable/) to compare the right hand side of the GS model with the right hand side of the CNODE, and we ask it to minimize the difference." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "include(\"coupling_functions/functions_CNODE_loss.jl\");\n", + "myloss = create_randloss_derivative(uv_data,\n", + " FG_target,\n", + " f_closed_CNODE,\n", + " st;\n", + " nuse = 64,\n", + " λ = 0);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "To initialize the training, we need some objects to monitor the procedure, and we trigger the first compilation." 
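+        ,
+        "\n",
+        "(A hedged aside, not in the original script: once `pinit` is built below, `InteractiveUtils.@code_warntype myloss(pinit)` is one way to look for the type warnings mentioned in the next cell.)"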
+ ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "lhist = Float32[];\n", + "# Initialize and trigger the compilation of the model\n", + "pinit = ComponentArrays.ComponentArray(θ);\n", + "myloss(pinit);\n", + "# [!] Check that the loss does not get type warnings, otherwise it will be slower" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We transform the NeuralODE into an optimization problem" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "# Select the autodifferentiation type\n", + "import OptimizationOptimisers: Optimization\n", + "adtype = Optimization.AutoZygote();\n", + "optf = Optimization.OptimizationFunction((x, p) -> myloss(x), adtype);\n", + "optprob = Optimization.OptimizationProblem(optf, pinit);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Select the training algorithm:\n", + "In the previous example we have used a classic gradient method like Adam:" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import OptimizationOptimisers: OptimiserChain, Adam\n", + "algo = OptimiserChain(Adam(1.0e-3));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "notice however that CNODEs can be trained with any Julia optimizer, including the ones from the `Optimization` package like LBFGS" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import OptimizationOptimJL: Optim\n", + "algo = Optim.LBFGS();" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "or even gradient-free methods like CMA-ES that we use for this example" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "using OptimizationCMAEvolutionStrategy, Statistics\n", + "algo = CMAEvolutionStrategyOpt();" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "### Train the CNODE" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "include(\"coupling_functions/functions_example.jl\") # callback function\n", + "result_neuralode = Optimization.solve(optprob,\n", + " algo;\n", + " callback = callback,\n", + " maxiters = 150);\n", + "pinit = result_neuralode.u;\n", + "θ = pinit;\n", + "optprob = Optimization.OptimizationProblem(optf, pinit);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "(Notice that the block above can be repeated to continue training, however don't do that with CMA-ES since it will restart from a random initial population)" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "## III. 
Analyse the results\n", + "Let's compare the learned weights to the values that we expect" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "using Plots, Plots.PlotMeasures\n", + "gs_w_u = θ.layer_3.layer_1.gs_weights;\n", + "gs_w_v = θ.layer_3.layer_2.gs_weights;\n", + "p1 = scatter(gs_w_u,\n", + " label = \"learned\",\n", + " title = \"Comparison NN_u coefficients\",\n", + " xlabel = \"Index\",\n", + " ylabel = \"Value\")\n", + "scatter!(p1, correct_w_u, label = \"correct\")\n", + "p2 = scatter(gs_w_v,\n", + " label = \"learned\",\n", + " title = \"Comparison NN_v coefficients\",\n", + " xlabel = \"Index\",\n", + " ylabel = \"Value\")\n", + "scatter!(p2, correct_w_v, label = \"correct\")\n", + "p = plot(p1, p2, layout = (2, 1))\n", + "display(p)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "The learned weights look perfect, but let's check what happens if we use them to solve the GS model." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Let's solve the system, for two different set of parameters, with the trained CNODE and compare with the exact solution" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "trange = (0.0, 500);\n", + "dt, saveat = (1, 5);\n", + "\n", + "# Exact solution\n", + "f_exact = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false)\n", + "θ_e, st_e = Lux.setup(rng, f_exact);\n", + "exact_CNODE = NeuralODE(f_exact,\n", + " trange,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "exact_CNODE_solution = Array(exact_CNODE(GS_sim[:, 1:2, 1], θ_e, st_e)[1]);\n", + "u = reshape(exact_CNODE_solution[1:(grid_GS.Nu), :, :],\n", + " grid_GS.nux,\n", + " grid_GS.nuy,\n", + " size(exact_CNODE_solution, 2),\n", + " :);\n", + "v = reshape(exact_CNODE_solution[(grid_GS.Nu + 1):end, :, :],\n", + " grid_GS.nvx,\n", + " grid_GS.nvy,\n", + " size(exact_CNODE_solution, 2),\n", + " :);\n", + "\n", + "# Trained solution\n", + "trained_CNODE = NeuralODE(f_closed_CNODE,\n", + " trange,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "trained_CNODE_solution = Array(trained_CNODE(GS_sim[:, 1:3, 1], θ, st)[1]);\n", + "u_trained = reshape(trained_CNODE_solution[1:(grid_GS.Nu), :, :],\n", + " grid_GS.nux,\n", + " grid_GS.nuy,\n", + " size(trained_CNODE_solution, 2),\n", + " :);\n", + "v_trained = reshape(trained_CNODE_solution[(grid_GS.Nu + 1):end, :, :],\n", + " grid_GS.nvx,\n", + " grid_GS.nvy,\n", + " size(trained_CNODE_solution, 2),\n", + " :);\n", + "f_u = create_f_CNODE(F_u_open, G_v_open, grid_GS, NN_u, NN_v; is_closed = false)\n", + "θ_u, st_u = Lux.setup(rng, f_u);\n", + "\n", + "# Untrained solution\n", + "untrained_CNODE = NeuralODE(f_u,\n", + " trange,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "untrained_CNODE_solution = Array(untrained_CNODE(GS_sim[:, 1:3, 1], θ_u, st_u)[1]);\n", + "u_untrained = reshape(untrained_CNODE_solution[1:(grid_GS.Nu), :, :],\n", + " grid_GS.nux,\n", + " grid_GS.nuy,\n", + " size(untrained_CNODE_solution, 2),\n", + " :);\n", + "v_untrained = reshape(untrained_CNODE_solution[(grid_GS.Nu + 1):end, :, :],\n", + " grid_GS.nvx,\n", + " grid_GS.nvy,\n", + " size(untrained_CNODE_solution, 2),\n", + " :);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "### Plot the results" + ], + "metadata": {} + }, + { + "outputs": 
[], + "cell_type": "code", + "source": [ + "anim = Animation()\n", + "fig = plot(layout = (2, 6), size = (1200, 400))\n", + "@gif for i in 1:1:size(u_trained, 4)\n", + " # First row: set of parameters 1\n", + " p1 = heatmap(u[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Exact\")\n", + " p2 = heatmap(v[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p3 = heatmap(u_untrained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Untrained\")\n", + " p4 = heatmap(v_untrained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p5 = heatmap(u_trained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Trained\")\n", + " p6 = heatmap(v_trained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + "\n", + " # Second row: set of parameters 2\n", + " p7 = heatmap(u[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Exact\")\n", + " p8 = heatmap(v[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p9 = heatmap(u_untrained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Untrained\")\n", + " p10 = heatmap(v_untrained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p11 = heatmap(u_trained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Trained\")\n", + " p12 = heatmap(v_trained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + "\n", + " fig = plot(p1,\n", + " p2,\n", + " p3,\n", + " p4,\n", + " p5,\n", + " p6,\n", + " p7,\n", + " p8,\n", + " p9,\n", + " p10,\n", + " p11,\n", + " p12,\n", + " layout = (2, 6),\n", + " margin = 0mm)\n", + " frame(anim, fig)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Save the generated .gif" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "if isdir(\"./plots\")\n", + " gif(anim, \"./plots/02.01-trained_GS.gif\", fps = 10)\n", + "else\n", + " gif(anim, \"examples/plots/02.01-trained_GS.gif\", fps = 10)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Notice that even with a posteriori loss of the order of 1e-7 still produces a different dynamics over time!" 
+   ],
+   "metadata": {}
+  },
+  {
+   "outputs": [],
+   "cell_type": "code",
+   "source": [
+    "myloss(θ)"
+   ],
+   "metadata": {},
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "---\n",
+    "\n",
+    "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*"
+   ],
+   "metadata": {}
+  }
+ ],
+ "nbformat_minor": 3,
+ "metadata": {
+  "language_info": {
+   "file_extension": ".jl",
+   "mimetype": "application/julia",
+   "name": "julia",
+   "version": "1.10.2"
+  },
+  "kernelspec": {
+   "name": "julia-1.10",
+   "display_name": "Julia 1.10.2",
+   "language": "julia"
+  }
+ },
+ "nbformat": 4
+}
diff --git a/examples/02.01-GrayScott.md b/examples/02.01-GrayScott.md
new file mode 100644
index 0000000..75b08b4
--- /dev/null
+++ b/examples/02.01-GrayScott.md
@@ -0,0 +1,521 @@
+```julia
+import CUDA
+ArrayType = CUDA.functional() ? CuArray : Array;
+```
+
+# Learning the Gray-Scott model: a priori fitting
+
+In the previous example [02.00-GrayScott.jl](02.00-GrayScott.jl) we have seen how to use the CNODE to solve the Gray-Scott model via an explicit method.
+Here we introduce the *learning part* of the CNODE and show how it can be used to close the equations. We are going to train the neural network via **a priori fitting**, and in the next example [02.02-GrayScott](02.02-GrayScott.jl) we will show how to use a posteriori fitting.
+
+As a reminder, the GS model is defined by
+\begin{equation}\begin{cases} \frac{du}{dt} = D_u \Delta u - uv^2 + f(1-u) \equiv F_u(u,v) \\ \frac{dv}{dt} = D_v \Delta v + uv^2 - (f+k)v \equiv G_v(u,v)\end{cases} \end{equation}
+where $u(x,y,t):\mathbb{R}^2\times \mathbb{R}\rightarrow \mathbb{R}$ is the concentration of species 1, while $v(x,y,t)$ is the concentration of species 2. This model reproduces the effect of the two species diffusing in their environment and reacting together.
+This effect is captured by the ratios between $D_u$ and $D_v$ (diffusion coefficients) and $f$ and $k$ (reaction rates).
+
+In this example, we will first (I) use the exact GS model to gather some data;
+then, in the second part (II), we will train a generic CNODE to show that it can learn the GS model from the data.
+
+## I. Solving GS to collect data
+Definition of the grid
+
+```julia
+include("coupling_functions/functions_NODE.jl")
+dux = duy = dvx = dvy = 1.0
+nux = nuy = nvx = nvy = 64
+grid_GS = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);
+```
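+
+`functions_NODE.jl` is not shown in this patch; a minimal sketch of what the `Grid` container could look like is given below. The field names (`dux`, ..., `Nu`, `Nv`) are inferred from how the struct is used in these examples, so treat this as an assumption rather than the actual implementation.
+
+```julia
+# Illustrative sketch only: the real Grid lives in functions_NODE.jl.
+# It stores the grid spacings and sizes for u and v, plus the flattened
+# lengths Nu = nux * nuy and Nv = nvx * nvy used for reshaping.
+struct Grid
+    dux::Float64
+    duy::Float64
+    nux::Int
+    nuy::Int
+    dvx::Float64
+    dvy::Float64
+    nvx::Int
+    nvy::Int
+    Nu::Int
+    Nv::Int
+end
+Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy) =
+    Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy, nux * nuy, nvx * nvy)
+```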
+
+Definition of the initial condition as a random perturbation over a constant background to add variety.
+Notice that these initial conditions are different from those of the previous example.
+
+```julia
+import Random
+function initial_condition(grid, U₀, V₀, ε_u, ε_v; nsimulations = 1)
+    u_init = U₀ .+ ε_u .* Random.randn(grid.nux, grid.nuy, nsimulations)
+    v_init = V₀ .+ ε_v .* Random.randn(grid.nvx, grid.nvy, nsimulations)
+    return u_init, v_init
+end
+U₀ = 0.5 # initial concentration of u
+V₀ = 0.25 # initial concentration of v
+ε_u = 0.05 # magnitude of the perturbation on u
+ε_v = 0.1 # magnitude of the perturbation on v
+u_initial, v_initial = initial_condition(grid_GS, U₀, V₀, ε_u, ε_v, nsimulations = 20);
+```
+
+We define the initial condition as a flattened concatenated array
+
+```julia
+uv0 = vcat(reshape(u_initial, grid_GS.Nu, :), reshape(v_initial, grid_GS.Nv, :));
+```
+
+These are the GS parameters (also used in example 02.00) that we will try to learn
+
+```julia
+D_u = 0.16
+D_v = 0.08
+f = 0.055
+k = 0.062;
+```
+
+Exact right hand sides (functions) of the GS model
+
+```julia
+include("coupling_functions/functions_FDderivatives.jl")
+F_u(u, v, grid) = D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u)
+G_v(u, v, grid) = D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2 .- (f + k) .* v
+```
+
+Definition of the CNODE
+
+```julia
+import Lux
+f_CNODE = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false);
+rng = Random.seed!(1234);
+θ_0, st_0 = Lux.setup(rng, f_CNODE);
+```
+
+**Burnout run:** to discard the transient dynamics caused by the initial conditions.
+In this case we need two burnouts: a first one with a small time step over a short time span, followed by one with a larger time step over a longer span. This allows us to discard the transient dynamics and obtain a good initial condition for the data collection run.
+
+```julia
+import DifferentialEquations: Tsit5
+import DiffEqFlux: NeuralODE
+trange_burn = (0.0, 1.0)
+dt, saveat = (1e-2, 1)
+burnout_CNODE = NeuralODE(f_CNODE,
+    trange_burn,
+    Tsit5(),
+    adaptive = false,
+    dt = dt,
+    saveat = saveat);
+burnout_CNODE_solution = Array(burnout_CNODE(uv0, θ_0, st_0)[1]);
+```
+
+Second burnout with a larger time step
+
+```julia
+trange_burn = (0.0, 500.0)
+dt, saveat = (1 / (4 * max(D_u, D_v)), 100)
+burnout_CNODE = NeuralODE(f_CNODE,
+    trange_burn,
+    Tsit5(),
+    adaptive = false,
+    dt = dt,
+    saveat = saveat);
+burnout_CNODE_solution = Array(burnout_CNODE(burnout_CNODE_solution[:, :, end], θ_0, st_0)[1]);
+```
+
+Data collection run
+
+```julia
+uv0 = burnout_CNODE_solution[:, :, end];
+trange = (0.0, 2000.0);
+dt, saveat = (1 / (4 * max(D_u, D_v)), 1);
+GS_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat);
+GS_sim = Array(GS_CNODE(uv0, θ_0, st_0)[1]);
+```
+
+`GS_sim` contains the solutions of $u$ and $v$ for the specified `trange` and `nsimulations` initial conditions. If you explore `GS_sim` you will see that it has the shape `((nux * nuy) + (nvx * nvy), nsimulations, timesteps)`.
+`uv_data` is a reshaped version of `GS_sim` that has the shape `(nux * nuy + nvx * nvy, nsimulations * timesteps)`. This is the format that we will use to train the CNODE.
+
+```julia
+uv_data = reshape(GS_sim, size(GS_sim, 1), size(GS_sim, 2) * size(GS_sim, 3));
+```
+
+We define `FG_target` containing the right hand sides (i.e. $\frac{du}{dt}$ and $\frac{dv}{dt}$) of each of the samples. We will see later that for the training `FG_target` is used as the labels to do derivative fitting.
+
+```julia
+FG_target = Array(f_CNODE(uv_data, θ_0, st_0)[1]);
+```
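+
+As an optional, purely illustrative sanity check (the exact number of saved snapshots depends on how the solver handles `saveat`), the shapes of the collected arrays should be consistent with each other:
+
+```julia
+# Rows are the flattened u and v fields stacked on top of each other.
+@assert size(GS_sim, 1) == grid_GS.Nu + grid_GS.Nv
+# uv_data merges the simulation and time axes into one sample axis.
+@assert size(uv_data, 2) == size(GS_sim, 2) * size(GS_sim, 3)
+# The labels should have one right-hand-side evaluation per sample.
+@assert size(FG_target) == size(uv_data)
+```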
+
+## II. Training a CNODE to learn the GS model via a priori training
+To learn the GS model, we will use the following CNODE
+\begin{equation}\begin{cases} \frac{du}{dt} = D_u \Delta u - uv^2 + \theta_{u,1} u +\theta_{u,2} v +\theta_{u,3} \\ \frac{dv}{dt} = D_v \Delta v + uv^2 + \theta_{v,1} u +\theta_{v,2} v +\theta_{v,3} \end{cases} \end{equation}
+In this example the deterministic function contains the diffusion and the coupling terms, while the model has to learn the source and death terms.
+The deterministic functions of the two coupled equations are:
+
+```julia
+import Zygote
+F_u_open(u, v, grid) = Zygote.@ignore D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2;
+G_v_open(u, v, grid) = Zygote.@ignore D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2;
+```
+
+We tell Zygote to ignore this tree branch for the gradient propagation.
+
+For the trainable part, we define an abstract Lux layer
+
+```julia
+struct GSLayer{F} <: Lux.AbstractExplicitLayer
+    init_weight::F
+end
+```
+
+and its outer constructor
+
+```julia
+function GSLayer(; init_weight = Lux.zeros32)
+    # alternatively: init_weight = Lux.glorot_uniform
+    return GSLayer(init_weight)
+end
+```
+
+We also need to specify how to initialize its parameters and states.
+This custom layer does not have any hidden states (RNGs) that are modified.
+
+```julia
+function Lux.initialparameters(rng::Random.AbstractRNG, (; init_weight)::GSLayer)
+    (;
+        gs_weights = init_weight(rng, 3),)
+end
+Lux.initialstates(::Random.AbstractRNG, ::GSLayer) = (;)
+Lux.parameterlength((;)::GSLayer) = 3
+Lux.statelength(::GSLayer) = 0
+```
+
+We define how to pass inputs through `GSLayer`, assuming the following:
+- Input size: `(N, N, 2, nsample)`, where the two channels are $u$ and $v$.
+- Output size: `(N, N, nsample)`, where we assumed monochannel output, so we dropped the channel dimension.
+
+This is what each layer does. Notice that the layer does not modify the state.
+
+```julia
+function ((;)::GSLayer)(x, params, state)
+    u = x[:, :, 1, :]
+    v = x[:, :, 2, :]
+    out = params.gs_weights[1] .* u .+ params.gs_weights[2] .* v .+ params.gs_weights[3]
+    out, state
+end
+```
+
+We create the trainable models. In this case each one is just a `GSLayer`, but the model can be as complex as needed.
+
+```julia
+NN_u = GSLayer()
+NN_v = GSLayer()
+```
+
+We can now close the CNODE with the neural network
+
+```julia
+f_closed_CNODE = create_f_CNODE(F_u_open, G_v_open, grid_GS, NN_u, NN_v; is_closed = true)
+θ, st = Lux.setup(rng, f_closed_CNODE);
+print(θ)
+```
+
+Check that the closed CNODE can reproduce the GS model if the parameters are set to the correct values
+
+```julia
+import ComponentArrays
+correct_w_u = [-f, 0, f];
+correct_w_v = [0, -(f + k), 0];
+θ_correct = ComponentArrays.ComponentArray(θ);
+θ_correct.layer_3.layer_1.gs_weights = correct_w_u;
+θ_correct.layer_3.layer_2.gs_weights = correct_w_v;
+```
+
+Notice that they are the same within a tolerance of 1e-7
+
+```julia
+isapprox(f_closed_CNODE(GS_sim[:, 1, 1], θ_correct, st)[1],
+    f_CNODE(GS_sim[:, 1, 1], θ_0, st_0)[1],
+    atol = 1e-7,
+    rtol = 1e-7)
+```
+
+but with a tolerance of 1e-8 the same check returns `false`.
+
+```julia
+isapprox(f_closed_CNODE(GS_sim[:, 1, 1], θ_correct, st)[1],
+    f_CNODE(GS_sim[:, 1, 1], θ_0, st_0)[1],
+    atol = 1e-8,
+    rtol = 1e-8)
+```
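+
+If you prefer a number over a boolean, an equivalent (illustrative) check is to look at the largest entry-wise discrepancy directly, which should land between the two tolerances above:
+
+```julia
+# Inspect the actual discrepancy between the two right-hand sides.
+rhs_closed = f_closed_CNODE(GS_sim[:, 1, 1], θ_correct, st)[1]
+rhs_exact = f_CNODE(GS_sim[:, 1, 1], θ_0, st_0)[1]
+maximum(abs.(rhs_closed .- rhs_exact)) # expected to be O(1e-8)
+```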
+
+In a chaotic system like GS, this difference would be enough to produce different dynamics, so be careful about this.
+
+If you have problems with training the model, you can cheat and start from the solution to check your implementation:
+```julia
+θ.layer_3.layer_1.gs_weights = correct_w_u
+θ.layer_3.layer_2.gs_weights = correct_w_v
+pinit = θ
+```
+
+### Design the loss function - a priori fitting
+For this example, we use *a priori* fitting. In this approach, the loss function is defined to minimize the difference between the right hand sides $\frac{du}{dt}$ and $\frac{dv}{dt}$ predicted by the model and those computed from the exact equations, stored in `FG_target`.
+In practice, we compare the right hand side of the GS model with the right hand side of the CNODE, and we use [Zygote](https://fluxml.ai/Zygote.jl/stable/) to differentiate and minimize this mismatch.
+
+```julia
+include("coupling_functions/functions_CNODE_loss.jl");
+myloss = create_randloss_derivative(uv_data,
+    FG_target,
+    f_closed_CNODE,
+    st;
+    nuse = 64,
+    λ = 0);
+```
+
+To initialize the training, we need some objects to monitor the procedure, and we trigger the first compilation.
+
+```julia
+lhist = Float32[];
+# Initialize and trigger the compilation of the model
+pinit = ComponentArrays.ComponentArray(θ);
+myloss(pinit);
+# [!] Check that the loss does not get type warnings, otherwise it will be slower
+```
+
+We transform the NeuralODE into an optimization problem
+
+```julia
+# Select the autodifferentiation type
+import OptimizationOptimisers: Optimization
+adtype = Optimization.AutoZygote();
+optf = Optimization.OptimizationFunction((x, p) -> myloss(x), adtype);
+optprob = Optimization.OptimizationProblem(optf, pinit);
+```
+
+Select the training algorithm:
+in the previous example we used a classic gradient-based method like Adam:
+
+```julia
+import OptimizationOptimisers: OptimiserChain, Adam
+algo = OptimiserChain(Adam(1.0e-3));
+```
+
+Notice, however, that CNODEs can be trained with any Julia optimizer, including the ones from the `Optimization` package, like LBFGS:
+
+```julia
+import OptimizationOptimJL: Optim
+algo = Optim.LBFGS();
+```
+
+or even gradient-free methods like CMA-ES, which we use for this example:
+
+```julia
+using OptimizationCMAEvolutionStrategy, Statistics
+algo = CMAEvolutionStrategyOpt();
+```
+
+### Train the CNODE
+
+```julia
+include("coupling_functions/functions_example.jl") # callback function
+result_neuralode = Optimization.solve(optprob,
+    algo;
+    callback = callback,
+    maxiters = 150);
+pinit = result_neuralode.u;
+θ = pinit;
+optprob = Optimization.OptimizationProblem(optf, pinit);
+```
+
+(Notice that the block above can be repeated to continue training; however, don't do that with CMA-ES, since it would restart from a random initial population.)
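+
+Optionally, you can visualize the training progress, assuming the callback included above pushes each loss value to `lhist` (an assumption about `functions_example.jl`, which is not shown here):
+
+```julia
+# Loss history on a log scale; useful to spot stagnation.
+using Plots
+plot(lhist; yscale = :log10, xlabel = "iteration", ylabel = "loss", label = "a priori loss")
+```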
+
+## III. Analyse the results
+Let's compare the learned weights to the values that we expect
+
+```julia
+using Plots, Plots.PlotMeasures
+gs_w_u = θ.layer_3.layer_1.gs_weights;
+gs_w_v = θ.layer_3.layer_2.gs_weights;
+p1 = scatter(gs_w_u,
+    label = "learned",
+    title = "Comparison NN_u coefficients",
+    xlabel = "Index",
+    ylabel = "Value")
+scatter!(p1, correct_w_u, label = "correct")
+p2 = scatter(gs_w_v,
+    label = "learned",
+    title = "Comparison NN_v coefficients",
+    xlabel = "Index",
+    ylabel = "Value")
+scatter!(p2, correct_w_v, label = "correct")
+p = plot(p1, p2, layout = (2, 1))
+display(p)
+```
+
+The learned weights look perfect, but let's check what happens if we use them to solve the GS model.
+
+Let's solve the system, for two different sets of parameters, with the trained CNODE and compare with the exact solution
+
+```julia
+trange = (0.0, 500);
+dt, saveat = (1, 5);
+
+# Exact solution
+f_exact = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false)
+θ_e, st_e = Lux.setup(rng, f_exact);
+exact_CNODE = NeuralODE(f_exact,
+    trange,
+    Tsit5(),
+    adaptive = false,
+    dt = dt,
+    saveat = saveat);
+exact_CNODE_solution = Array(exact_CNODE(GS_sim[:, 1:2, 1], θ_e, st_e)[1]);
+u = reshape(exact_CNODE_solution[1:(grid_GS.Nu), :, :],
+    grid_GS.nux,
+    grid_GS.nuy,
+    size(exact_CNODE_solution, 2),
+    :);
+v = reshape(exact_CNODE_solution[(grid_GS.Nu + 1):end, :, :],
+    grid_GS.nvx,
+    grid_GS.nvy,
+    size(exact_CNODE_solution, 2),
+    :);
+
+# Trained solution
+trained_CNODE = NeuralODE(f_closed_CNODE,
+    trange,
+    Tsit5(),
+    adaptive = false,
+    dt = dt,
+    saveat = saveat);
+trained_CNODE_solution = Array(trained_CNODE(GS_sim[:, 1:3, 1], θ, st)[1]);
+u_trained = reshape(trained_CNODE_solution[1:(grid_GS.Nu), :, :],
+    grid_GS.nux,
+    grid_GS.nuy,
+    size(trained_CNODE_solution, 2),
+    :);
+v_trained = reshape(trained_CNODE_solution[(grid_GS.Nu + 1):end, :, :],
+    grid_GS.nvx,
+    grid_GS.nvy,
+    size(trained_CNODE_solution, 2),
+    :);
+f_u = create_f_CNODE(F_u_open, G_v_open, grid_GS, NN_u, NN_v; is_closed = false)
+θ_u, st_u = Lux.setup(rng, f_u);
+
+# Untrained solution
+untrained_CNODE = NeuralODE(f_u,
+    trange,
+    Tsit5(),
+    adaptive = false,
+    dt = dt,
+    saveat = saveat);
+untrained_CNODE_solution = Array(untrained_CNODE(GS_sim[:, 1:3, 1], θ_u, st_u)[1]);
+u_untrained = reshape(untrained_CNODE_solution[1:(grid_GS.Nu), :, :],
+    grid_GS.nux,
+    grid_GS.nuy,
+    size(untrained_CNODE_solution, 2),
+    :);
+v_untrained = reshape(untrained_CNODE_solution[(grid_GS.Nu + 1):end, :, :],
+    grid_GS.nvx,
+    grid_GS.nvy,
+    size(untrained_CNODE_solution, 2),
+    :);
+```
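+
+The three reshape blocks above are identical up to their input. A small helper (hypothetical, not part of the repository) could factor out this boilerplate:
+
+```julia
+# Split a flattened CNODE solution of shape (Nu + Nv, nsamples, ntimes)
+# back into u and v fields of shape (nx, ny, nsamples, ntimes).
+function split_uv(sol, grid)
+    u = reshape(sol[1:(grid.Nu), :, :], grid.nux, grid.nuy, size(sol, 2), :)
+    v = reshape(sol[(grid.Nu + 1):end, :, :], grid.nvx, grid.nvy, size(sol, 2), :)
+    return u, v
+end
+# e.g. u, v = split_uv(exact_CNODE_solution, grid_GS)
+```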
+
+### Plot the results
+
+```julia
+anim = Animation()
+fig = plot(layout = (2, 6), size = (1200, 400))
+@gif for i in 1:1:size(u_trained, 4)
+    # First row: set of parameters 1
+    p1 = heatmap(u[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :reds,
+        title = "Exact")
+    p2 = heatmap(v[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :blues)
+    p3 = heatmap(u_untrained[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :reds,
+        title = "Untrained")
+    p4 = heatmap(v_untrained[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :blues)
+    p5 = heatmap(u_trained[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :reds,
+        title = "Trained")
+    p6 = heatmap(v_trained[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :blues)
+
+    # Second row: set of parameters 2
+    p7 = heatmap(u[:, :, 2, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :reds,
+        title = "Exact")
+    p8 = heatmap(v[:, :, 2, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :blues)
+    p9 = heatmap(u_untrained[:, :, 2, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :reds,
+        title = "Untrained")
+    p10 = heatmap(v_untrained[:, :, 2, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :blues)
+    p11 = heatmap(u_trained[:, :, 2, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :reds,
+        title = "Trained")
+    p12 = heatmap(v_trained[:, :, 2, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :blues)
+
+    fig = plot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12,
+        layout = (2, 6), margin = 0mm)
+    frame(anim, fig)
+end
+```
+
+Save the generated .gif
+
+```julia
+if isdir("./plots")
+    gif(anim, "./plots/02.01-trained_GS.gif", fps = 10)
+else
+    gif(anim, "examples/plots/02.01-trained_GS.gif", fps = 10)
+end
+```
+
+Notice that even with an a priori loss of the order of 1e-7, the trained model still produces different dynamics over time!
+
+```julia
+myloss(θ)
+```
+
+---
+
+*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*
+
diff --git a/examples/02.02-GrayScott.ipynb b/examples/02.02-GrayScott.ipynb
new file mode 100644
index 0000000..e30af2d
--- /dev/null
+++ b/examples/02.02-GrayScott.ipynb
@@ -0,0 +1,884 @@
+{
+ "cells": [
+  {
+   "outputs": [],
+   "cell_type": "code",
+   "source": [
+    "import CUDA\n",
+    "ArrayType = CUDA.functional() ? CuArray : Array;"
+   ],
+   "metadata": {},
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# Learning the Gray-Scott model: a posteriori fitting"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "In this example, we will learn how to approximate a closure to the Gray-Scott model with a neural network trained via a posteriori fitting, using a [multishooting](https://docs.sciml.ai/DiffEqFlux/dev/examples/multiple_shooting/) approach."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "As a reminder, the GS model is defined by\n",
+    "\\begin{equation}\\begin{cases} \\frac{du}{dt} = D_u \\Delta u - uv^2 + f(1-u) \\equiv F_u(u,v) \\\\ \\frac{dv}{dt} = D_v \\Delta v + uv^2 - (f+k)v \\equiv G_v(u,v)\\end{cases} \\end{equation}\n",
+    "where $u(x,y,t):\\mathbb{R}^2\\times \\mathbb{R}\\rightarrow \\mathbb{R}$ is the concentration of species 1, while $v(x,y,t)$ is the concentration of species 2. This model reproduces the effect of the two species diffusing in their environment and reacting together.\n",
+    "This effect is captured by the ratios between $D_u$ and $D_v$ (the diffusion coefficients) and $f$ and $k$ (the reaction rates)."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "As in the previous example, we will first (I) use the exact GS model to gather some data, and then in the second part (II) we will train a neural network to approximate the GS model using a posteriori fitting."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## I. 
Solving GS to collect data\n", + "Definition of the grid" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "include(\"coupling_functions/functions_NODE.jl\")\n", + "dux = duy = dvx = dvy = 1.0\n", + "nux = nuy = nvx = nvy = 64\n", + "grid_GS = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Define the initial condition as a random perturbation over a constant background to add variety. Notice that in this case we are generating only 2 samples (i.e. `nsimulations=2`). This is because for the *a posteriori fitting* we are using a fine sampling." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import Random\n", + "function initial_condition(grid, U₀, V₀, ε_u, ε_v; nsimulations = 1)\n", + " u_init = U₀ .+ ε_u .* Random.randn(grid.nux, grid.nuy, nsimulations)\n", + " v_init = V₀ .+ ε_v .* Random.randn(grid.nvx, grid.nvy, nsimulations)\n", + " return u_init, v_init\n", + "end\n", + "U₀ = 0.5 # initial concentration of u\n", + "V₀ = 0.25 # initial concentration of v\n", + "ε_u = 0.05 # magnitude of the perturbation on u\n", + "ε_v = 0.1 # magnitude of the perturbation on v\n", + "u_initial, v_initial = initial_condition(grid_GS, U₀, V₀, ε_u, ε_v, nsimulations = 2);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "$u$ and $v$ are concatenated in a flattended array" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = vcat(reshape(u_initial, grid_GS.Nu, :), reshape(v_initial, grid_GS.Nv, :));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "These are the GS parameters (also used in examples 02.01 and 02.02) that we will try to learn." 
+ ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "D_u = 0.16\n", + "D_v = 0.08\n", + "f = 0.055\n", + "k = 0.062;" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Exact right hand sides of the GS model:" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "include(\"coupling_functions/functions_FDderivatives.jl\")\n", + "F_u(u, v, grid) = D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u)\n", + "G_v(u, v, grid) = D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2 .- (f + k) .* v" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "CNODE definition" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import Lux\n", + "f_CNODE = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false);\n", + "rng = Random.seed!(1234);\n", + "θ_0, st_0 = Lux.setup(rng, f_CNODE);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "**Burnout run**" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import DifferentialEquations: Tsit5\n", + "import DiffEqFlux: NeuralODE\n", + "trange_burn = (0.0, 1.0)\n", + "dt, saveat = (1e-2, 1)\n", + "burnout_CNODE = NeuralODE(f_CNODE,\n", + " trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "burnout_CNODE_solution = Array(burnout_CNODE(uv0, θ_0, st_0)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Second burnout with a larger timestep" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "trange_burn = (0.0, 800.0)\n", + "dt, saveat = (1 / (4 * max(D_u, D_v)), 100)\n", + "burnout_CNODE = NeuralODE(f_CNODE,\n", + " trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "burnout_CNODE_solution = Array(burnout_CNODE(burnout_CNODE_solution[:, :, end], θ_0, st_0)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Data collection run" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = burnout_CNODE_solution[:, :, end];" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "For the a-posteriori fitting, we use a fine sampling for two reasons:\n", + "1. use a handlable amount of data points\n", + "2. prevent instabilities while training\n", + "However, this means that the simulation can not be long." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dt, saveat = (1 / (4 * max(D_u, D_v)), 0.001)\n", + "trange = (0.0, 50.0)\n", + "GS_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat);\n", + "GS_sim = Array(GS_CNODE(uv0, θ_0, st_0)[1])" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "## II. 
Training a CNODE to learn the GS model via a posteriori training\n", + "To learn the GS model, we will use the following CNODE\n", + "\\begin{equation}\\begin{cases} \\frac{du}{dt} = D_u \\Delta u + \\theta_{u,1} uv^2 +\\theta_{u,2} v^2u + \\theta_{u,3} u +\\theta_{u,4} v +\\theta_{u,5} \\\\ \\frac{dv}{dt} = D_v \\Delta v + \\theta_{v,1} uv^2 + \\theta_{v,2} v^2u +\\theta_{v,3} u +\\theta_{v,4} v +\\theta_{v,5} \\end{cases} \\end{equation}\n", + "In this example the deterministic function contains the diffusion and the coupling terms, while the model has to learn the source and death terms.\n", + "Then the deterministic functions of the two coupled equations are" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import Zygote\n", + "F_u_open(u, v, grid) = Zygote.@ignore D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2\n", + "G_v_open(u, v, grid) = Zygote.@ignore D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We tell Zygote to ignore this tree branch for the gradient propagation." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Definition of Neural functions\n", + "In this case we define different architectures for $u$: $NN_u$ and $v$: $NN_v$, to make the training easier." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "struct GSLayer_u{F} <: Lux.AbstractExplicitLayer\n", + " init_weight::F\n", + "end\n", + "function GSLayer_u(; init_weight = Lux.zeros32)\n", + " return GSLayer_u(init_weight)\n", + "end\n", + "struct GSLayer_v{F} <: Lux.AbstractExplicitLayer\n", + " init_weight::F\n", + "end\n", + "function GSLayer_v(; init_weight = Lux.zeros32)\n", + " return GSLayer_v(init_weight)\n", + "end\n", + "\n", + "function Lux.initialparameters(rng::Random.AbstractRNG, (; init_weight)::GSLayer_u)\n", + " (;\n", + " gs_weights = init_weight(rng, 2),)\n", + "end\n", + "function Lux.initialparameters(rng::Random.AbstractRNG, (; init_weight)::GSLayer_v)\n", + " (;\n", + " gs_weights = init_weight(rng, 1),)\n", + "end\n", + "Lux.initialstates(::Random.AbstractRNG, ::GSLayer_u) = (;)\n", + "Lux.initialstates(::Random.AbstractRNG, ::GSLayer_v) = (;)\n", + "Lux.parameterlength((;)::GSLayer_u) = 2\n", + "Lux.parameterlength((;)::GSLayer_v) = 1\n", + "Lux.statelength(::GSLayer_u) = 0\n", + "Lux.statelength(::GSLayer_v) = 0\n", + "function ((;)::GSLayer_u)(x, params, state)\n", + " u = x[:, :, 1, :]\n", + " out = params.gs_weights[1] .* u .+ params.gs_weights[2]\n", + " out, state\n", + "end\n", + "function ((;)::GSLayer_v)(x, params, state)\n", + " v = x[:, :, 2, :]\n", + " out = params.gs_weights[1] .* v\n", + " out, state\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "The trainable parts are thus:" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "NN_u = GSLayer_u()\n", + "NN_v = GSLayer_v()" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Close the CNODE with the Neural Network" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_closed_CNODE = create_f_CNODE(F_u_open, G_v_open, grid_GS, NN_u, NN_v; is_closed = true)\n", + "θ, st = Lux.setup(rng, f_closed_CNODE);\n", + "import ComponentArrays\n", + "θ = ComponentArrays.ComponentArray(θ)" + ], + "metadata": {}, + "execution_count": null + }, 
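+  {
+   "cell_type": "markdown",
+   "source": [
+    "As a quick, purely illustrative check (not part of the original pipeline): the closed CNODE should now expose `2 + 1 = 3` trainable closure parameters in total."
+   ],
+   "metadata": {}
+  },
+  {
+   "outputs": [],
+   "cell_type": "code",
+   "source": [
+    "Lux.parameterlength(NN_u) + Lux.parameterlength(NN_v)"
+   ],
+   "metadata": {},
+   "execution_count": null
+  },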
+ { + "cell_type": "markdown", + "source": [ + "### Design the loss function - a posteriori fitting\n", + "In *a posteriori* fitting, we rely on a differentiable solver to propagate the gradient through the solution of the NODE. In this case, we use the *multishooting a posteriori* fitting [(MulDtO)](https://docs.sciml.ai/DiffEqFlux/dev/examples/multiple_shooting/), where we use `Zygote` to compare `nintervals` of length `nunroll` to get the gradient. Notice that this method is differentiating through the solution of the NODE!" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "nunroll = 10\n", + "nintervals = 5\n", + "noverlaps = 1\n", + "nsamples = 1;" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Since we want to control the time step and the total length of the solutions that we have to compute at each iteration, we define an auxiliary NODE that will be used for training.\n", + "In particular, we can use smaller time steps for the training, but have to sample at the same rate as the data.\n", + "Also, it is important to solve for only the time interval thas is needed at each training step (corresponding to `nunroll` steps)\n", + "*Note:* The GS model is stiff, so we need to use a small time step to solve it. In previous versions we had two different CNODEs, the second one would be used in case the solver would be unstable. In this version, we stick to a smaller time step that those used in the previous examples to avoid instabilities." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dt_train = 0.001;\n", + "saveat_train = saveat\n", + "t_train_range = (0.0, saveat_train * nunroll)\n", + "training_CNODE = NeuralODE(f_closed_CNODE,\n", + " t_train_range,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt_train,\n", + " saveat = saveat_train);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Create the loss" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "include(\"coupling_functions/functions_CNODE_loss.jl\");\n", + "myloss = create_randloss_MulDtO(GS_sim,\n", + " nunroll = nunroll,\n", + " noverlaps = noverlaps,\n", + " nintervals = nintervals,\n", + " nsamples = nsamples,\n", + " λ_c = 1e2,\n", + " λ_l1 = 1e-1);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "To initialize the training, we need some objects to monitor the procedure, and we trigger the first compilation." 
+ ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "lhist = Float32[];" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Initialize and trigger the compilation of the model" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "pinit = ComponentArrays.ComponentArray(θ)\n", + "myloss(pinit) # trigger compilation" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "⚠️ Check that the loss does not get type warnings, otherwise it will be slower" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Autodifferentiation type" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import OptimizationOptimisers: Optimization\n", + "using Statistics\n", + "adtype = Optimization.AutoZygote();" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We transform the NeuralODE into an optimization problem" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "optf = Optimization.OptimizationFunction((x, p) -> myloss(x), adtype);\n", + "optprob = Optimization.OptimizationProblem(optf, pinit);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "### Training algorithm\n", + "In this example we use a quasi-newtonian method" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import OptimizationOptimJL: Optim\n", + "import LineSearches\n", + "algo = Optim.LBFGS(linesearch = LineSearches.BackTracking(order = 3));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "### Train the CNODEs" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "include(\"coupling_functions/functions_example.jl\")\n", + "result_neuralode = Optimization.solve(optprob,\n", + " algo;\n", + " callback = callback,\n", + " maxiters = 50);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We may get `**Warning:** Instability detected. Aborting` for the first time steps of the training. This is due to the stiff nature of the GS model as explained earlier. The training will continue after the first few steps." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "pinit = result_neuralode.u;\n", + "θ = pinit\n", + "optprob = Optimization.OptimizationProblem(optf, pinit);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "## III. 
Analyse the results" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Comparison: learned weights vs (expected) values of the parameters" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "using Plots, Plots.PlotMeasures\n", + "correct_w_u = [-f, f]\n", + "correct_w_v = [-(f + k)]\n", + "gs_w_u = θ.layer_3.layer_1.gs_weights\n", + "gs_w_v = θ.layer_3.layer_2.gs_weights\n", + "p1 = scatter(gs_w_u,\n", + " label = \"learned\",\n", + " title = \"Comparison NN_u coefficients\",\n", + " xlabel = \"Index\",\n", + " ylabel = \"Value\")\n", + "scatter!(p1, correct_w_u, label = \"correct\")\n", + "p2 = scatter(gs_w_v,\n", + " label = \"learned\",\n", + " title = \"Comparison NN_v coefficients\",\n", + " xlabel = \"Index\",\n", + " ylabel = \"Value\")\n", + "scatter!(p2, correct_w_v, label = \"correct\")\n", + "p = plot(p1, p2, layout = (2, 1))\n", + "display(p)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "The learned weights look perfect, let's check what happens if we use them to solve the GS model." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### Comparison: CNODE vs exact solutions" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "trange = (0.0, 600)\n", + "dt, saveat = (1, 5)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Exact solution" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_exact = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false)\n", + "θ_e, st_e = Lux.setup(rng, f_exact);\n", + "exact_CNODE = NeuralODE(f_exact,\n", + " trange,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "exact_CNODE_solution = Array(exact_CNODE(GS_sim[:, 1:2, 1], θ_e, st_e)[1]);\n", + "u = reshape(exact_CNODE_solution[1:(grid_GS.Nu), :, :],\n", + " grid_GS.nux,\n", + " grid_GS.nuy,\n", + " size(exact_CNODE_solution, 2),\n", + " :);\n", + "v = reshape(exact_CNODE_solution[(grid_GS.Nu + 1):end, :, :],\n", + " grid_GS.nvx,\n", + " grid_GS.nvy,\n", + " size(exact_CNODE_solution, 2),\n", + " :);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Trained solution" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "trained_CNODE = NeuralODE(f_closed_CNODE,\n", + " trange,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "trained_CNODE_solution = Array(trained_CNODE(GS_sim[:, 1:2, 1], θ, st)[1]);\n", + "u_trained = reshape(trained_CNODE_solution[1:(grid_GS.Nu), :, :],\n", + " grid_GS.nux,\n", + " grid_GS.nuy,\n", + " size(trained_CNODE_solution, 2),\n", + " :);\n", + "v_trained = reshape(trained_CNODE_solution[(grid_GS.Nu + 1):end, :, :],\n", + " grid_GS.nvx,\n", + " grid_GS.nvy,\n", + " size(trained_CNODE_solution, 2),\n", + " :);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Untrained solution" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_u = create_f_CNODE(F_u_open, G_v_open, grid_GS, NN_u, NN_v; is_closed = false)\n", + "θ_u, st_u = Lux.setup(rng, f_u);\n", + "untrained_CNODE = NeuralODE(f_u,\n", + " trange,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "untrained_CNODE_solution = 
Array(untrained_CNODE(GS_sim[:, 1:2, 1], θ_u, st_u)[1]);\n", + "u_untrained = reshape(untrained_CNODE_solution[1:(grid_GS.Nu), :, :],\n", + " grid_GS.nux,\n", + " grid_GS.nuy,\n", + " size(untrained_CNODE_solution, 2),\n", + " :);\n", + "v_untrained = reshape(untrained_CNODE_solution[(grid_GS.Nu + 1):end, :, :],\n", + " grid_GS.nvx,\n", + " grid_GS.nvy,\n", + " size(untrained_CNODE_solution, 2),\n", + " :);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "### Plot the results" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "anim = Animation()\n", + "fig = plot(layout = (2, 6), size = (1200, 400))\n", + "@gif for i in 1:1:size(u_trained, 4)\n", + " # First row: set of parameters 1\n", + " p1 = heatmap(u[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Exact\")\n", + " p2 = heatmap(v[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p3 = heatmap(u_untrained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Untrained\")\n", + " p4 = heatmap(v_untrained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p5 = heatmap(u_trained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Trained\")\n", + " p6 = heatmap(v_trained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + "\n", + " # Second row: set of parameters 2\n", + " p7 = heatmap(u[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Exact\")\n", + " p8 = heatmap(v[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p9 = heatmap(u_untrained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Untrained\")\n", + " p10 = heatmap(v_untrained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p11 = heatmap(u_trained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Trained\")\n", + " p12 = heatmap(v_trained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + "\n", + " fig = plot(p1,\n", + " p2,\n", + " p3,\n", + " p4,\n", + " p5,\n", + " p6,\n", + " p7,\n", + " p8,\n", + " p9,\n", + " p10,\n", + " p11,\n", + " p12,\n", + " layout = (2, 6),\n", + " margin = 0mm)\n", + " frame(anim, fig)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Save the generated .gif" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "if isdir(\"./plots\")\n", + " gif(anim, \"./plots/02.02-trained_GS.gif\", fps = 10)\n", + "else\n", + " gif(anim, \"examples/plots/02.02-trained_GS.gif\", fps = 10)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "---\n", + "\n", + "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*" + ], + "metadata": {} + } + ], + "nbformat_minor": 3, + 
"metadata": { + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.10.2" + }, + "kernelspec": { + "name": "julia-1.10", + "display_name": "Julia 1.10.2", + "language": "julia" + } + }, + "nbformat": 4 +} diff --git a/examples/02.02-GrayScott.md b/examples/02.02-GrayScott.md new file mode 100644 index 0000000..e66da23 --- /dev/null +++ b/examples/02.02-GrayScott.md @@ -0,0 +1,505 @@ +```julia +import CUDA +ArrayType = CUDA.functional() ? CuArray : Array; +``` + +# Learning the Gray-Scott model: a posteriori fitting + +In this example, we will learn how to approximate a closure to the Gray-Scott model with a Neural Network trained via a posteriori fitting, using a [multishooting](https://docs.sciml.ai/DiffEqFlux/dev/examples/multiple_shooting/) approach. + +As a reminder, the GS model is defined from +\begin{equation}\begin{cases} \frac{du}{dt} = D_u \Delta u - uv^2 + f(1-u) \equiv F_u(u,v) \\ \frac{dv}{dt} = D_v \Delta v + uv^2 - (f+k)v \equiv G_v(u,v)\end{cases} \end{equation} +where $u(x,y,t):\mathbb{R}^2\times \mathbb{R}\rightarrow \mathbb{R}$ is the concentration of species 1, while $v(x,y,t)$ is the concentration of species two. This model reproduce the effect of the two species diffusing in their environment, and reacting together. +This effect is captured by the ratios between $D_u$ and $D_v$ (the diffusion coefficients) and $f$ and $k$ (the reaction rates). + +As in the repvious example, we will first (I) use the exact GS model to gather some data and then in the second part (II) we will train a neural network to approximate the GS model using a posteriori fitting. + +## I. Solving GS to collect data +Definition of the grid + +```julia +include("coupling_functions/functions_NODE.jl") +dux = duy = dvx = dvy = 1.0 +nux = nuy = nvx = nvy = 64 +grid_GS = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy); +``` + +Define the initial condition as a random perturbation over a constant background to add variety. Notice that in this case we are generating only 2 samples (i.e. `nsimulations=2`). This is because for the *a posteriori fitting* we are using a fine sampling. + +```julia +import Random +function initial_condition(grid, U₀, V₀, ε_u, ε_v; nsimulations = 1) + u_init = U₀ .+ ε_u .* Random.randn(grid.nux, grid.nuy, nsimulations) + v_init = V₀ .+ ε_v .* Random.randn(grid.nvx, grid.nvy, nsimulations) + return u_init, v_init +end +U₀ = 0.5 # initial concentration of u +V₀ = 0.25 # initial concentration of v +ε_u = 0.05 # magnitude of the perturbation on u +ε_v = 0.1 # magnitude of the perturbation on v +u_initial, v_initial = initial_condition(grid_GS, U₀, V₀, ε_u, ε_v, nsimulations = 2); +``` + +$u$ and $v$ are concatenated in a flattended array + +```julia +uv0 = vcat(reshape(u_initial, grid_GS.Nu, :), reshape(v_initial, grid_GS.Nv, :)); +``` + +These are the GS parameters (also used in examples 02.01 and 02.02) that we will try to learn. 
+
+Exact right hand sides of the GS model:
+
+```julia
+include("coupling_functions/functions_FDderivatives.jl")
+F_u(u, v, grid) = D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u)
+G_v(u, v, grid) = D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2 .- (f + k) .* v
+```
+
+CNODE definition
+
+```julia
+import Lux
+f_CNODE = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false);
+rng = Random.seed!(1234);
+θ_0, st_0 = Lux.setup(rng, f_CNODE);
+```
+
+**Burnout run**
+
+```julia
+import DifferentialEquations: Tsit5
+import DiffEqFlux: NeuralODE
+trange_burn = (0.0, 1.0)
+dt, saveat = (1e-2, 1)
+burnout_CNODE = NeuralODE(f_CNODE,
+    trange_burn,
+    Tsit5(),
+    adaptive = false,
+    dt = dt,
+    saveat = saveat);
+burnout_CNODE_solution = Array(burnout_CNODE(uv0, θ_0, st_0)[1]);
+```
+
+Second burnout with a larger time step
+
+```julia
+trange_burn = (0.0, 800.0)
+dt, saveat = (1 / (4 * max(D_u, D_v)), 100)
+burnout_CNODE = NeuralODE(f_CNODE,
+    trange_burn,
+    Tsit5(),
+    adaptive = false,
+    dt = dt,
+    saveat = saveat);
+burnout_CNODE_solution = Array(burnout_CNODE(burnout_CNODE_solution[:, :, end], θ_0, st_0)[1]);
+```
+
+Data collection run
+
+```julia
+uv0 = burnout_CNODE_solution[:, :, end];
+```
+
+For the a posteriori fitting, we use a fine sampling for two reasons:
+1. to keep the amount of data points manageable;
+2. to prevent instabilities while training.
+However, this means that the simulation cannot be long.
+
+```julia
+dt, saveat = (1 / (4 * max(D_u, D_v)), 0.001)
+trange = (0.0, 50.0)
+GS_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat);
+GS_sim = Array(GS_CNODE(uv0, θ_0, st_0)[1])
+```
+
+## II. Training a CNODE to learn the GS model via a posteriori training
+To learn the GS model, we will use the following CNODE
+\begin{equation}\begin{cases} \frac{du}{dt} = D_u \Delta u + \theta_{u,1} uv^2 +\theta_{u,2} v^2u + \theta_{u,3} u +\theta_{u,4} v +\theta_{u,5} \\ \frac{dv}{dt} = D_v \Delta v + \theta_{v,1} uv^2 + \theta_{v,2} v^2u +\theta_{v,3} u +\theta_{v,4} v +\theta_{v,5} \end{cases} \end{equation}
+In this example the deterministic function contains the diffusion and the coupling terms, while the model has to learn the source and death terms.
+Then the deterministic functions of the two coupled equations are
+
+```julia
+import Zygote
+F_u_open(u, v, grid) = Zygote.@ignore D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2
+G_v_open(u, v, grid) = Zygote.@ignore D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2
+```
+
+We tell Zygote to ignore this tree branch for the gradient propagation.
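+
+To see what `Zygote.@ignore` does, here is a tiny standalone example (illustrative only): the ignored expression still contributes to the value, but is treated as a constant by the gradient.
+
+```julia
+import Zygote
+g(x) = Zygote.@ignore(2x) + x^2
+# The 2x term is invisible to AD: only d(x^2)/dx = 2x survives.
+Zygote.gradient(g, 3.0) # (6.0,)
+```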
+
+### Definition of Neural functions
+In this case we define different architectures for $u$ ($NN_u$) and $v$ ($NN_v$), to make the training easier.
+
+```julia
+struct GSLayer_u{F} <: Lux.AbstractExplicitLayer
+    init_weight::F
+end
+function GSLayer_u(; init_weight = Lux.zeros32)
+    return GSLayer_u(init_weight)
+end
+struct GSLayer_v{F} <: Lux.AbstractExplicitLayer
+    init_weight::F
+end
+function GSLayer_v(; init_weight = Lux.zeros32)
+    return GSLayer_v(init_weight)
+end
+
+function Lux.initialparameters(rng::Random.AbstractRNG, (; init_weight)::GSLayer_u)
+    (;
+        gs_weights = init_weight(rng, 2),)
+end
+function Lux.initialparameters(rng::Random.AbstractRNG, (; init_weight)::GSLayer_v)
+    (;
+        gs_weights = init_weight(rng, 1),)
+end
+Lux.initialstates(::Random.AbstractRNG, ::GSLayer_u) = (;)
+Lux.initialstates(::Random.AbstractRNG, ::GSLayer_v) = (;)
+Lux.parameterlength((;)::GSLayer_u) = 2
+Lux.parameterlength((;)::GSLayer_v) = 1
+Lux.statelength(::GSLayer_u) = 0
+Lux.statelength(::GSLayer_v) = 0
+function ((;)::GSLayer_u)(x, params, state)
+    u = x[:, :, 1, :]
+    out = params.gs_weights[1] .* u .+ params.gs_weights[2]
+    out, state
+end
+function ((;)::GSLayer_v)(x, params, state)
+    v = x[:, :, 2, :]
+    out = params.gs_weights[1] .* v
+    out, state
+end
+```
+
+The trainable parts are thus:
+
+```julia
+NN_u = GSLayer_u()
+NN_v = GSLayer_v()
+```
+
+Close the CNODE with the neural network
+
+```julia
+f_closed_CNODE = create_f_CNODE(F_u_open, G_v_open, grid_GS, NN_u, NN_v; is_closed = true)
+θ, st = Lux.setup(rng, f_closed_CNODE);
+import ComponentArrays
+θ = ComponentArrays.ComponentArray(θ)
+```
+
+### Design the loss function - a posteriori fitting
+In *a posteriori* fitting, we rely on a differentiable solver to propagate the gradient through the solution of the NODE. In this case, we use the *multishooting a posteriori* fitting [(MulDtO)](https://docs.sciml.ai/DiffEqFlux/dev/examples/multiple_shooting/), where we use `Zygote` to compare `nintervals` of length `nunroll` to get the gradient. Notice that this method differentiates through the solution of the NODE!
+
+```julia
+nunroll = 10
+nintervals = 5
+noverlaps = 1
+nsamples = 1;
+```
+
+Since we want to control the time step and the total length of the solutions that we have to compute at each iteration, we define an auxiliary NODE that will be used for training.
+In particular, we can use smaller time steps for the training, but we have to sample at the same rate as the data.
+Also, it is important to solve only for the time interval that is needed at each training step (corresponding to `nunroll` steps).
+*Note:* The GS model is stiff, so we need to use a small time step to solve it. In previous versions we had two different CNODEs; the second one would be used in case the solver became unstable. In this version, we stick to a smaller time step than those used in the previous examples to avoid instabilities.
+
+```julia
+dt_train = 0.001;
+saveat_train = saveat
+t_train_range = (0.0, saveat_train * nunroll)
+training_CNODE = NeuralODE(f_closed_CNODE,
+    t_train_range,
+    Tsit5(),
+    adaptive = false,
+    dt = dt_train,
+    saveat = saveat_train);
+```
+
+Create the loss
+
+```julia
+include("coupling_functions/functions_CNODE_loss.jl");
+myloss = create_randloss_MulDtO(GS_sim,
+    nunroll = nunroll,
+    noverlaps = noverlaps,
+    nintervals = nintervals,
+    nsamples = nsamples,
+    λ_c = 1e2,
+    λ_l1 = 1e-1);
+```
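+
+Schematically, a multishooting loss of this kind samples random windows from the reference trajectory, unrolls the NODE on each window, and combines a trajectory-mismatch term, a continuity penalty between consecutive windows (weighted by `λ_c`), and an L1 penalty on the parameters (weighted by `λ_l1`). The sketch below is a simplification under those assumptions, not the actual `create_randloss_MulDtO`:
+
+```julia
+# Simplified sketch of a multishooting (MulDtO) loss with a single window;
+# the real create_randloss_MulDtO draws nintervals overlapping windows and
+# adds a λ_c-weighted continuity penalty gluing consecutive windows.
+function sketch_MulDtO_loss(θ, data, solve_window; nunroll = 10, λ_l1 = 1e-1)
+    T = size(data, 3)
+    i0 = rand(1:(T - nunroll))              # random starting snapshot
+    target = data[:, 1, i0:(i0 + nunroll)]  # reference window, (N, nunroll + 1)
+    pred = solve_window(target[:, 1], θ)    # unrolled NODE solution, same shape
+    sum(abs2, pred .- target) + λ_l1 * sum(abs, θ)  # fit + L1 regularization
+end
+```
+
+Here `solve_window(u0, θ)` would wrap a call like `Array(training_CNODE(u0, θ, st)[1])`.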
+
+```julia
+lhist = Float32[];
+```
+
+Initialize and trigger the compilation of the model
+
+```julia
+pinit = ComponentArrays.ComponentArray(θ)
+myloss(pinit) # trigger compilation
+```
+
+⚠️ Check that the loss evaluation does not produce type warnings, otherwise it will be slower
+
+### Autodifferentiation type
+
+```julia
+import OptimizationOptimisers: Optimization
+using Statistics
+adtype = Optimization.AutoZygote();
+```
+
+We transform the NeuralODE into an optimization problem
+
+```julia
+optf = Optimization.OptimizationFunction((x, p) -> myloss(x), adtype);
+optprob = Optimization.OptimizationProblem(optf, pinit);
+```
+
+### Training algorithm
+In this example we use a quasi-Newton method
+
+```julia
+import OptimizationOptimJL: Optim
+import LineSearches
+algo = Optim.LBFGS(linesearch = LineSearches.BackTracking(order = 3));
+```
+
+### Train the CNODEs
+
+```julia
+include("coupling_functions/functions_example.jl")
+result_neuralode = Optimization.solve(optprob,
+    algo;
+    callback = callback,
+    maxiters = 50);
+```
+
+We may get the warning `Instability detected. Aborting` for the first time steps of the training. This is due to the stiff nature of the GS model, as explained earlier; the training will continue after the first few steps.
+
+```julia
+pinit = result_neuralode.u;
+θ = pinit
+optprob = Optimization.OptimizationProblem(optf, pinit);
+```
+
+## III. Analyse the results
+
+### Comparison: learned weights vs (expected) values of the parameters
+
+```julia
+using Plots, Plots.PlotMeasures
+correct_w_u = [-f, f]
+correct_w_v = [-(f + k)]
+gs_w_u = θ.layer_3.layer_1.gs_weights
+gs_w_v = θ.layer_3.layer_2.gs_weights
+p1 = scatter(gs_w_u,
+    label = "learned",
+    title = "Comparison NN_u coefficients",
+    xlabel = "Index",
+    ylabel = "Value")
+scatter!(p1, correct_w_u, label = "correct")
+p2 = scatter(gs_w_v,
+    label = "learned",
+    title = "Comparison NN_v coefficients",
+    xlabel = "Index",
+    ylabel = "Value")
+scatter!(p2, correct_w_v, label = "correct")
+p = plot(p1, p2, layout = (2, 1))
+display(p)
+```
+
+The learned weights look perfect; let's check what happens when we use them to solve the GS model.
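+
+Before moving on, we can back that visual impression with a number. This is a minimal sketch, assuming `gs_w_u`, `gs_w_v`, `correct_w_u` and `correct_w_v` from the cell above are still in scope:
+
+```julia
+# Worst-case relative error of the learned closure coefficients
+err_u = maximum(abs.(gs_w_u .- correct_w_u) ./ abs.(correct_w_u))
+err_v = maximum(abs.(gs_w_v .- correct_w_v) ./ abs.(correct_w_v))
+println("max relative error: u = $err_u, v = $err_v")
+```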
+ +### Comparison: CNODE vs exact solutions + +```julia +trange = (0.0, 600) +dt, saveat = (1, 5) +``` + +Exact solution + +```julia +f_exact = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false) +θ_e, st_e = Lux.setup(rng, f_exact); +exact_CNODE = NeuralODE(f_exact, + trange, + Tsit5(), + adaptive = false, + dt = dt, + saveat = saveat); +exact_CNODE_solution = Array(exact_CNODE(GS_sim[:, 1:2, 1], θ_e, st_e)[1]); +u = reshape(exact_CNODE_solution[1:(grid_GS.Nu), :, :], + grid_GS.nux, + grid_GS.nuy, + size(exact_CNODE_solution, 2), + :); +v = reshape(exact_CNODE_solution[(grid_GS.Nu + 1):end, :, :], + grid_GS.nvx, + grid_GS.nvy, + size(exact_CNODE_solution, 2), + :); +``` + +Trained solution + +```julia +trained_CNODE = NeuralODE(f_closed_CNODE, + trange, + Tsit5(), + adaptive = false, + dt = dt, + saveat = saveat); +trained_CNODE_solution = Array(trained_CNODE(GS_sim[:, 1:2, 1], θ, st)[1]); +u_trained = reshape(trained_CNODE_solution[1:(grid_GS.Nu), :, :], + grid_GS.nux, + grid_GS.nuy, + size(trained_CNODE_solution, 2), + :); +v_trained = reshape(trained_CNODE_solution[(grid_GS.Nu + 1):end, :, :], + grid_GS.nvx, + grid_GS.nvy, + size(trained_CNODE_solution, 2), + :); +``` + +Untrained solution + +```julia +f_u = create_f_CNODE(F_u_open, G_v_open, grid_GS, NN_u, NN_v; is_closed = false) +θ_u, st_u = Lux.setup(rng, f_u); +untrained_CNODE = NeuralODE(f_u, + trange, + Tsit5(), + adaptive = false, + dt = dt, + saveat = saveat); +untrained_CNODE_solution = Array(untrained_CNODE(GS_sim[:, 1:2, 1], θ_u, st_u)[1]); +u_untrained = reshape(untrained_CNODE_solution[1:(grid_GS.Nu), :, :], + grid_GS.nux, + grid_GS.nuy, + size(untrained_CNODE_solution, 2), + :); +v_untrained = reshape(untrained_CNODE_solution[(grid_GS.Nu + 1):end, :, :], + grid_GS.nvx, + grid_GS.nvy, + size(untrained_CNODE_solution, 2), + :); +``` + +### Plot the results + +```julia +anim = Animation() +fig = plot(layout = (2, 6), size = (1200, 400)) +@gif for i in 1:1:size(u_trained, 4) + # First row: set of parameters 1 + p1 = heatmap(u[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "Exact") + p2 = heatmap(v[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p3 = heatmap(u_untrained[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "Untrained") + p4 = heatmap(v_untrained[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p5 = heatmap(u_trained[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "Trained") + p6 = heatmap(v_trained[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + + # Second row: set of parameters 2 + p7 = heatmap(u[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "Exact") + p8 = heatmap(v[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p9 = heatmap(u_untrained[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "Untrained") + p10 = heatmap(v_untrained[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p11 = heatmap(u_trained[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "Trained") + p12 = heatmap(v_trained[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + + fig = plot(p1, + p2, + p3, + p4, + p5, + p6, + p7, + p8, + p9, + p10, + p11, + p12, + layout = (2, 6), + 
margin = 0mm) + frame(anim, fig) +end +``` + +Save the generated .gif + +```julia +if isdir("./plots") + gif(anim, "./plots/02.02-trained_GS.gif", fps = 10) +else + gif(anim, "examples/plots/02.02-trained_GS.gif", fps = 10) +end +``` + +--- + +*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* + diff --git a/examples/02.03-GrayScott.ipynb b/examples/02.03-GrayScott.ipynb new file mode 100644 index 0000000..d8911cf --- /dev/null +++ b/examples/02.03-GrayScott.ipynb @@ -0,0 +1,724 @@ +{ + "cells": [ + { + "outputs": [], + "cell_type": "code", + "source": [ + "import CUDA\n", + "ArrayType = CUDA.functional() ? CuArray : Array;" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "# Learning the Gray-Scott model: Effect of grid coarsening" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "In this example we want to show the effect of grid coarsening on the solution of a PDE.\n", + "We will introduce one of the most important problems in the numerical solution of PDEs, that we will try to solve in the following examples using CNODEs." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "We use again the GS model, which is defined from\n", + "\\begin{equation}\\begin{cases} \\frac{du}{dt} = D_u \\Delta u - uv^2 + f(1-u) \\equiv F_u(u,v) \\\\ \\frac{dv}{dt} = D_v \\Delta v + uv^2 - (f+k)v \\equiv G_v(u,v)\\end{cases} \\end{equation}\n", + "where $u(x,y,t):\\mathbb{R}^2\\times \\mathbb{R}\\rightarrow \\mathbb{R}$ is the concentration of species 1, while $v(x,y,t)$ is the concentration of species two. This model reproduce the effect of the two species diffusing in their environment, and reacting together.\n", + "This effect is captured by the ratios between $D_u$ and $D_v$ (the diffusion coefficients) and $f$ and $k$ (the reaction rates)." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### The *exact* solution\n", + "Even if the GS model can not be solved analytically, we can discretize it on a very fine grid and expect its solution to be almost exact.\n", + "We will use it as a reference to compare the solution on a coarser grid.\n", + "Notice that the simple fact that we are discretizing, makes this solution technically a DNS (Direct Numerical Simulation) and not an exact solution, but since we are using a very fine grid, we will call it *exact* for simplicity." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Let's define the finest grid using 200 steps of 0.5 in each direction, reaching a 100[L] x 100[L] domain." 
+ ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "include(\"coupling_functions/functions_NODE.jl\")\n", + "dux = duy = dvx = dvy = 0.5\n", + "nux = nuy = nvx = nvy = 200\n", + "grid_GS = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We start with a central concentration of $v$" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "function initialize_uv(grid, u_bkg, v_bkg, center_size)\n", + " u_initial = u_bkg * ones(grid.nux, grid.nuy)\n", + " v_initial = zeros(grid.nvx, grid.nvy)\n", + " v_initial[Int(grid.nvx / 2 - center_size):Int(grid.nvx / 2 + center_size), Int(grid.nvy / 2 - center_size):Int(grid.nvy / 2 + center_size)] .= v_bkg\n", + " return u_initial, v_initial\n", + "end\n", + "u_initial, v_initial = initialize_uv(grid_GS, 0.8, 0.9, 4);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We can now define the initial condition as a flattened concatenated array" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = vcat(reshape(u_initial, grid_GS.nux * grid_GS.nuy, 1),\n", + " reshape(v_initial, grid_GS.nvx * grid_GS.nvy, 1))" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "From the literature, we have selected the following parameters in order to form nice patterns" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "D_u = 0.16\n", + "D_v = 0.08\n", + "f = 0.055\n", + "k = 0.062;" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Here we (the user) define the **right hand sides** of the equations" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "include(\"coupling_functions/functions_FDderivatives.jl\");\n", + "F_u(u, v, grid) = D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u)\n", + "G_v(u, v, grid) = D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2 .- (f + k) .* v" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Once the forces have been defined, we can create the CNODE" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_CNODE = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and we ask Lux for the parameters to train and their structure [none in this example]" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import Random, Lux\n", + "rng = Random.seed!(1234)\n", + "θ, st = Lux.setup(rng, f_CNODE);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Actuallly, we are not training any parameters, but using `NeuralODE` for consistency with the resto of examples. Therefore, we see that $\\theta$ is empty." 
+ ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "length(θ)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We now do a short *burnout run* to get rid of the initial artifacts" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import DifferentialEquations: Tsit5\n", + "import DiffEqFlux: NeuralODE\n", + "trange_burn = (0.0, 10.0)\n", + "dt_burn, saveat_burn = (1e-2, 1)\n", + "full_CNODE = NeuralODE(f_CNODE,\n", + " trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt_burn,\n", + " saveat = saveat_burn)\n", + "burnout_CNODE_solution = Array(full_CNODE(uv0, θ, st)[1])" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "**CNODE run**\n", + "We use the output of the burnout to start a longer simulations" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = burnout_CNODE_solution[:, :, end];\n", + "trange = (0.0, 7000.0)\n", + "dt, saveat = (0.5, 20)\n", + "full_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat)\n", + "untrained_CNODE_solution = Array(full_CNODE(uv0, θ, st)[1])" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "And we unpack the solution to get the two species from" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "u_exact = reshape(untrained_CNODE_solution[1:(grid_GS.Nu), :, :],\n", + " grid_GS.nux,\n", + " grid_GS.nuy,\n", + " size(untrained_CNODE_solution, 2),\n", + " :)\n", + "v_exact = reshape(untrained_CNODE_solution[(grid_GS.Nu + 1):end, :, :],\n", + " grid_GS.nvx,\n", + " grid_GS.nvy,\n", + " size(untrained_CNODE_solution, 2),\n", + " :);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Plot the solution as an animation" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "using Plots\n", + "anim = Animation()\n", + "fig = plot(layout = (1, 2), size = (600, 300))\n", + "@gif for i in 1:2:size(u_exact, 4)\n", + " p1 = heatmap(u_exact[:, :, 1, i],\n", + " axis = false,\n", + " bar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u(x,y)\")\n", + " p2 = heatmap(v_exact[:, :, 1, i],\n", + " axis = false,\n", + " bar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v(x,y)\")\n", + " time = round(i * saveat, digits = 0)\n", + " fig = plot(p1, p2, layout = (1, 2), plot_title = \"time = $(time)\")\n", + " frame(anim, fig)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "### **DNS**: Direct Numerical Simulation\n", + "The DNS is a refined solution of the PDE, where the grid is so fine that the solution is almost exact.\n", + "Technically, the *exact solution* in the previous section is also a DNS (because it is discrete), but we want to show that the grid can be made a bit coarser while preserving the dynamics of the original PDE.\n", + "This is because we want an efficient DNS that we can run to collect data for analysis, prediction and to train ML models." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "The DNS grid will consists of 150 steps of 100 in each direction, covering the 100[L] x 100[L] domain." 
+ ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dux = duy = dvx = dvy = 100 / 150\n", + "nux = nuy = nvx = nvy = 150\n", + "dns_grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Use the same initial condition as the exact solution" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "import Images: imresize\n", + "u0_dns = imresize(u_initial, (dns_grid.nux, dns_grid.nuy));\n", + "v0_dns = imresize(v_initial, (dns_grid.nvx, dns_grid.nvy));\n", + "uv0_dns = vcat(reshape(u0_dns, dns_grid.nux * dns_grid.nuy, 1),\n", + " reshape(v0_dns, dns_grid.nvx * dns_grid.nvy, 1))" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "define the forces and create the CNODE" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_dns = create_f_CNODE(F_u, G_v, dns_grid; is_closed = false)\n", + "θ, st = Lux.setup(rng, f_dns);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "burnout run" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dns_CNODE = NeuralODE(f_dns,\n", + " trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt_burn,\n", + " saveat = saveat_burn)\n", + "burnout_dns = Array(dns_CNODE(uv0_dns, θ, st)[1])" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "DNS simulation" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = burnout_dns[:, :, end];\n", + "dns_CNODE = NeuralODE(f_dns, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat)\n", + "dns_solution = Array(dns_CNODE(uv0, θ, st)[1])\n", + "u_dns = reshape(dns_solution[1:(dns_grid.Nu), :, :],\n", + " dns_grid.nux,\n", + " dns_grid.nuy,\n", + " size(dns_solution, 2),\n", + " :)\n", + "v_dns = reshape(dns_solution[(dns_grid.Nu + 1):end, :, :],\n", + " dns_grid.nvx,\n", + " dns_grid.nvy,\n", + " size(dns_solution, 2),\n", + " :);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Plot DNS vs exact solution" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "anim = Animation()\n", + "fig = plot(layout = (2, 2), size = (600, 600))\n", + "@gif for i in 1:2:size(u_exact, 4)\n", + " p1 = heatmap(u_exact[:, :, 1, i],\n", + " axis = false,\n", + " bar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u(x,y)\")\n", + " p2 = heatmap(v_exact[:, :, 1, i],\n", + " axis = false,\n", + " bar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v(x,y)\")\n", + " p3 = heatmap(u_dns[:, :, 1, i],\n", + " axis = false,\n", + " bar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u(x,y)\")\n", + " p4 = heatmap(v_dns[:, :, 1, i],\n", + " axis = false,\n", + " bar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v(x,y)\")\n", + " time = round(i * saveat, digits = 0)\n", + " fig = plot(p1, p2, p3, p4, layout = (2, 2), plot_title = \"time = $(time)\")\n", + " frame(anim, fig)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "### **LES**: Large Eddy Simulation\n", + "The LES is a coarser solution of the PDE, where the grid is so coarse that the solution 
is not exact, but it still captures the main features of the original PDE.\n", + "It is used to reduce the computational cost of the DNS such that we can run it for longer.\n", + "However we will see that what it saves in computational cost, it loses in accuracy.\n", + "In the following examples, the goal of the CNODE will be to correct the LES solution to make it more accurate." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "This is the grid we will use for the LES, with 75 steps of 100/75[L] in each direction, covering the 100[L] x 100[L] domain." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dux = duy = dvx = dvy = 100 / 75\n", + "nux = nuy = nvx = nvy = 75\n", + "les_grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Use the same initial condition as the exact solution" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "u0_les = imresize(u_initial, (les_grid.nux, les_grid.nuy));\n", + "v0_les = imresize(v_initial, (les_grid.nvx, les_grid.nvy));\n", + "uv0_les = vcat(reshape(u0_les, les_grid.nux * les_grid.nuy, 1),\n", + " reshape(v0_les, les_grid.nvx * les_grid.nvy, 1))" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Compare the initial conditions" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "p1 = heatmap(u_initial,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " title = \"u exact\",\n", + " color = :reds)\n", + "p2 = heatmap(v_initial,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " title = \"v exact\",\n", + " color = :blues)\n", + "p3 = heatmap(u0_dns,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " title = \"u_0 DNS\",\n", + " color = :reds)\n", + "p4 = heatmap(v0_dns,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " title = \"v_0 DNS\",\n", + " color = :blues)\n", + "p5 = heatmap(u0_les,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " title = \"u_0 LES\",\n", + " color = :reds)\n", + "p6 = heatmap(v0_les,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " title = \"v_0 LES\",\n", + " color = :blues)\n", + "plot(p1, p2, p3, p4, p5, p6, layout = (3, 2), plot_title = \"Initial conditions\")" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "define the forces and create the CNODE" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_les = create_f_CNODE(F_u, G_v, les_grid; is_closed = false)\n", + "θ, st = Lux.setup(rng, f_les);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "burnout run" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "les_CNODE = NeuralODE(f_les,\n", + " trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt_burn,\n", + " saveat = saveat_burn)\n", + "burnout_les = Array(les_CNODE(uv0_les, θ, st)[1])" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "LES simulation" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = burnout_les[:, :, end];\n", + "les_CNODE = NeuralODE(f_les, trange, Tsit5(), adaptive 
= false, dt = dt, saveat = saveat)\n", + "les_solution = Array(les_CNODE(uv0, θ, st)[1])" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "And we unpack the solution to get the two species from" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "u_les = reshape(les_solution[1:(les_grid.Nu), :, :],\n", + " les_grid.nux,\n", + " les_grid.nuy,\n", + " size(les_solution, 2),\n", + " :)\n", + "v_les = reshape(les_solution[(les_grid.Nu + 1):end, :, :],\n", + " les_grid.nvx,\n", + " les_grid.nvy,\n", + " size(les_solution, 2),\n", + " :);\n", + "\n", + "anim = Animation()\n", + "fig = plot(layout = (3, 2), size = (600, 900))\n", + "@gif for i in 1:2:size(u_exact, 4)\n", + " p1 = heatmap(u_exact[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u exact\")\n", + " p2 = heatmap(v_exact[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v exact\")\n", + " p3 = heatmap(u_dns[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u DNS\")\n", + " p4 = heatmap(v_dns[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v DNS\")\n", + " p5 = heatmap(u_les[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u LES\")\n", + " p6 = heatmap(v_les[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v LES\")\n", + " time = round(i * saveat, digits = 0)\n", + " fig = plot(p1, p2, p3, p4, p5, p6, layout = (3, 2), plot_title = \"time = $(time)\")\n", + " frame(anim, fig)\n", + "end\n", + "if isdir(\"./plots\")\n", + " gif(anim, \"plots/02.03_gridsize.gif\", fps = 10)\n", + "else\n", + " gif(anim, \"examples/plots/02.03_gridsize.gif\", fps = 10)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "In the figure we see that the LES has induced some artifacts that influences the dynamics. In the next example, we will solve these artifacts using the Neural part of the CNODEs." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "---\n", + "\n", + "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*" + ], + "metadata": {} + } + ], + "nbformat_minor": 3, + "metadata": { + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.10.2" + }, + "kernelspec": { + "name": "julia-1.10", + "display_name": "Julia 1.10.2", + "language": "julia" + } + }, + "nbformat": 4 +} diff --git a/examples/02.03-GrayScott.md b/examples/02.03-GrayScott.md new file mode 100644 index 0000000..04868cc --- /dev/null +++ b/examples/02.03-GrayScott.md @@ -0,0 +1,410 @@ +```julia +import CUDA +ArrayType = CUDA.functional() ? CuArray : Array; +``` + +# Learning the Gray-Scott model: Effect of grid coarsening + +In this example we want to show the effect of grid coarsening on the solution of a PDE. +We will introduce one of the most important problems in the numerical solution of PDEs, that we will try to solve in the following examples using CNODEs. 
+
+We use again the GS model, which is defined from
+\begin{equation}\begin{cases} \frac{du}{dt} = D_u \Delta u - uv^2 + f(1-u)  \equiv F_u(u,v) \\ \frac{dv}{dt} = D_v \Delta v + uv^2 - (f+k)v  \equiv G_v(u,v)\end{cases} \end{equation}
+where $u(x,y,t):\mathbb{R}^2\times \mathbb{R}\rightarrow \mathbb{R}$ is the concentration of species 1, while $v(x,y,t)$ is the concentration of species 2. This model reproduces the effect of the two species diffusing in their environment and reacting together.
+This effect is captured by the ratios between $D_u$ and $D_v$ (the diffusion coefficients) and $f$ and $k$ (the reaction rates).
+
+### The *exact* solution
+Even if the GS model cannot be solved analytically, we can discretize it on a very fine grid and expect its solution to be almost exact.
+We will use it as a reference to compare the solution on a coarser grid.
+Notice that the simple fact that we are discretizing makes this solution technically a DNS (Direct Numerical Simulation) and not an exact solution, but since we are using a very fine grid, we will call it *exact* for simplicity.
+
+Let's define the finest grid using 200 steps of 0.5 in each direction, reaching a 100[L] x 100[L] domain.
+
+```julia
+include("coupling_functions/functions_NODE.jl")
+dux = duy = dvx = dvy = 0.5
+nux = nuy = nvx = nvy = 200
+grid_GS = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);
+```
+
+We start with a central concentration of $v$
+
+```julia
+function initialize_uv(grid, u_bkg, v_bkg, center_size)
+    u_initial = u_bkg * ones(grid.nux, grid.nuy)
+    v_initial = zeros(grid.nvx, grid.nvy)
+    v_initial[Int(grid.nvx / 2 - center_size):Int(grid.nvx / 2 + center_size), Int(grid.nvy / 2 - center_size):Int(grid.nvy / 2 + center_size)] .= v_bkg
+    return u_initial, v_initial
+end
+u_initial, v_initial = initialize_uv(grid_GS, 0.8, 0.9, 4);
+```
+
+We can now define the initial condition as a flattened concatenated array
+
+```julia
+uv0 = vcat(reshape(u_initial, grid_GS.nux * grid_GS.nuy, 1),
+    reshape(v_initial, grid_GS.nvx * grid_GS.nvy, 1))
+```
+
+From the literature, we have selected the following parameters in order to form nice patterns
+
+```julia
+D_u = 0.16
+D_v = 0.08
+f = 0.055
+k = 0.062;
+```
+
+Here we (the user) define the **right hand sides** of the equations
+
+```julia
+include("coupling_functions/functions_FDderivatives.jl");
+F_u(u, v, grid) = D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u)
+G_v(u, v, grid) = D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2 .- (f + k) .* v
+```
+
+Once the forces have been defined, we can create the CNODE
+
+```julia
+f_CNODE = create_f_CNODE(F_u, G_v, grid_GS; is_closed = false)
+```
+
+and we ask Lux for the parameters to train and their structure (none in this example)
+
+```julia
+import Random, Lux
+rng = Random.seed!(1234)
+θ, st = Lux.setup(rng, f_CNODE);
+```
+
+Actually, we are not training any parameters; we use `NeuralODE` only for consistency with the rest of the examples. Therefore, we see that $\theta$ is empty.
+
+```julia
+length(θ)
+```
+
+We now do a short *burnout run* to get rid of the initial artifacts
+
+```julia
+import DifferentialEquations: Tsit5
+import DiffEqFlux: NeuralODE
+trange_burn = (0.0, 10.0)
+dt_burn, saveat_burn = (1e-2, 1)
+full_CNODE = NeuralODE(f_CNODE,
+    trange_burn,
+    Tsit5(),
+    adaptive = false,
+    dt = dt_burn,
+    saveat = saveat_burn)
+burnout_CNODE_solution = Array(full_CNODE(uv0, θ, st)[1])
+```
+
+**CNODE run**
+We use the output of the burnout to start a longer simulation
+
+```julia
+uv0 = burnout_CNODE_solution[:, :, end];
+trange = (0.0, 7000.0)
+dt, saveat = (0.5, 20)
+full_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat)
+untrained_CNODE_solution = Array(full_CNODE(uv0, θ, st)[1])
+```
+
+And we unpack the solution to get the two species
+
+```julia
+u_exact = reshape(untrained_CNODE_solution[1:(grid_GS.Nu), :, :],
+    grid_GS.nux,
+    grid_GS.nuy,
+    size(untrained_CNODE_solution, 2),
+    :)
+v_exact = reshape(untrained_CNODE_solution[(grid_GS.Nu + 1):end, :, :],
+    grid_GS.nvx,
+    grid_GS.nvy,
+    size(untrained_CNODE_solution, 2),
+    :);
+```
+
+Plot the solution as an animation
+
+```julia
+using Plots
+anim = Animation()
+fig = plot(layout = (1, 2), size = (600, 300))
+@gif for i in 1:2:size(u_exact, 4)
+    p1 = heatmap(u_exact[:, :, 1, i],
+        axis = false,
+        bar = false,
+        aspect_ratio = 1,
+        color = :reds,
+        title = "u(x,y)")
+    p2 = heatmap(v_exact[:, :, 1, i],
+        axis = false,
+        bar = false,
+        aspect_ratio = 1,
+        color = :blues,
+        title = "v(x,y)")
+    time = round(i * saveat, digits = 0)
+    fig = plot(p1, p2, layout = (1, 2), plot_title = "time = $(time)")
+    frame(anim, fig)
+end
+```
+
+### **DNS**: Direct Numerical Simulation
+The DNS is a refined solution of the PDE, where the grid is so fine that the solution is almost exact.
+Technically, the *exact solution* in the previous section is also a DNS (because it is discrete), but we want to show that the grid can be made a bit coarser while preserving the dynamics of the original PDE.
+This is because we want an efficient DNS that we can run to collect data for analysis, prediction, and for training ML models.
+
+The DNS grid will consist of 150 steps of 100/150 ≈ 0.67[L] in each direction, covering the 100[L] x 100[L] domain.
+ +```julia +dux = duy = dvx = dvy = 100 / 150 +nux = nuy = nvx = nvy = 150 +dns_grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy); +``` + +Use the same initial condition as the exact solution + +```julia +import Images: imresize +u0_dns = imresize(u_initial, (dns_grid.nux, dns_grid.nuy)); +v0_dns = imresize(v_initial, (dns_grid.nvx, dns_grid.nvy)); +uv0_dns = vcat(reshape(u0_dns, dns_grid.nux * dns_grid.nuy, 1), + reshape(v0_dns, dns_grid.nvx * dns_grid.nvy, 1)) +``` + +define the forces and create the CNODE + +```julia +f_dns = create_f_CNODE(F_u, G_v, dns_grid; is_closed = false) +θ, st = Lux.setup(rng, f_dns); +``` + +burnout run + +```julia +dns_CNODE = NeuralODE(f_dns, + trange_burn, + Tsit5(), + adaptive = false, + dt = dt_burn, + saveat = saveat_burn) +burnout_dns = Array(dns_CNODE(uv0_dns, θ, st)[1]) +``` + +DNS simulation + +```julia +uv0 = burnout_dns[:, :, end]; +dns_CNODE = NeuralODE(f_dns, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat) +dns_solution = Array(dns_CNODE(uv0, θ, st)[1]) +u_dns = reshape(dns_solution[1:(dns_grid.Nu), :, :], + dns_grid.nux, + dns_grid.nuy, + size(dns_solution, 2), + :) +v_dns = reshape(dns_solution[(dns_grid.Nu + 1):end, :, :], + dns_grid.nvx, + dns_grid.nvy, + size(dns_solution, 2), + :); +``` + +Plot DNS vs exact solution + +```julia +anim = Animation() +fig = plot(layout = (2, 2), size = (600, 600)) +@gif for i in 1:2:size(u_exact, 4) + p1 = heatmap(u_exact[:, :, 1, i], + axis = false, + bar = false, + aspect_ratio = 1, + color = :reds, + title = "u(x,y)") + p2 = heatmap(v_exact[:, :, 1, i], + axis = false, + bar = false, + aspect_ratio = 1, + color = :blues, + title = "v(x,y)") + p3 = heatmap(u_dns[:, :, 1, i], + axis = false, + bar = false, + aspect_ratio = 1, + color = :reds, + title = "u(x,y)") + p4 = heatmap(v_dns[:, :, 1, i], + axis = false, + bar = false, + aspect_ratio = 1, + color = :blues, + title = "v(x,y)") + time = round(i * saveat, digits = 0) + fig = plot(p1, p2, p3, p4, layout = (2, 2), plot_title = "time = $(time)") + frame(anim, fig) +end +``` + +### **LES**: Large Eddy Simulation +The LES is a coarser solution of the PDE, where the grid is so coarse that the solution is not exact, but it still captures the main features of the original PDE. +It is used to reduce the computational cost of the DNS such that we can run it for longer. +However we will see that what it saves in computational cost, it loses in accuracy. +In the following examples, the goal of the CNODE will be to correct the LES solution to make it more accurate. + +This is the grid we will use for the LES, with 75 steps of 100/75[L] in each direction, covering the 100[L] x 100[L] domain. 
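+
+For reference, this halves the DNS resolution in each direction: the LES state has $2 \times 75^2 = 11250$ unknowns against $2 \times 150^2 = 45000$ for the DNS, i.e. a factor of 4 fewer degrees of freedom per right-hand-side evaluation (ignoring any effect of the coarser grid on the admissible time step).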
+ +```julia +dux = duy = dvx = dvy = 100 / 75 +nux = nuy = nvx = nvy = 75 +les_grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy); +``` + +Use the same initial condition as the exact solution + +```julia +u0_les = imresize(u_initial, (les_grid.nux, les_grid.nuy)); +v0_les = imresize(v_initial, (les_grid.nvx, les_grid.nvy)); +uv0_les = vcat(reshape(u0_les, les_grid.nux * les_grid.nuy, 1), + reshape(v0_les, les_grid.nvx * les_grid.nvy, 1)) +``` + +Compare the initial conditions + +```julia +p1 = heatmap(u_initial, + axis = false, + cbar = false, + aspect_ratio = 1, + title = "u exact", + color = :reds) +p2 = heatmap(v_initial, + axis = false, + cbar = false, + aspect_ratio = 1, + title = "v exact", + color = :blues) +p3 = heatmap(u0_dns, + axis = false, + cbar = false, + aspect_ratio = 1, + title = "u_0 DNS", + color = :reds) +p4 = heatmap(v0_dns, + axis = false, + cbar = false, + aspect_ratio = 1, + title = "v_0 DNS", + color = :blues) +p5 = heatmap(u0_les, + axis = false, + cbar = false, + aspect_ratio = 1, + title = "u_0 LES", + color = :reds) +p6 = heatmap(v0_les, + axis = false, + cbar = false, + aspect_ratio = 1, + title = "v_0 LES", + color = :blues) +plot(p1, p2, p3, p4, p5, p6, layout = (3, 2), plot_title = "Initial conditions") +``` + +define the forces and create the CNODE + +```julia +f_les = create_f_CNODE(F_u, G_v, les_grid; is_closed = false) +θ, st = Lux.setup(rng, f_les); +``` + +burnout run + +```julia +les_CNODE = NeuralODE(f_les, + trange_burn, + Tsit5(), + adaptive = false, + dt = dt_burn, + saveat = saveat_burn) +burnout_les = Array(les_CNODE(uv0_les, θ, st)[1]) +``` + +LES simulation + +```julia +uv0 = burnout_les[:, :, end]; +les_CNODE = NeuralODE(f_les, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat) +les_solution = Array(les_CNODE(uv0, θ, st)[1]) +``` + +And we unpack the solution to get the two species from + +```julia +u_les = reshape(les_solution[1:(les_grid.Nu), :, :], + les_grid.nux, + les_grid.nuy, + size(les_solution, 2), + :) +v_les = reshape(les_solution[(les_grid.Nu + 1):end, :, :], + les_grid.nvx, + les_grid.nvy, + size(les_solution, 2), + :); + +anim = Animation() +fig = plot(layout = (3, 2), size = (600, 900)) +@gif for i in 1:2:size(u_exact, 4) + p1 = heatmap(u_exact[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "u exact") + p2 = heatmap(v_exact[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues, + title = "v exact") + p3 = heatmap(u_dns[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "u DNS") + p4 = heatmap(v_dns[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues, + title = "v DNS") + p5 = heatmap(u_les[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "u LES") + p6 = heatmap(v_les[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues, + title = "v LES") + time = round(i * saveat, digits = 0) + fig = plot(p1, p2, p3, p4, p5, p6, layout = (3, 2), plot_title = "time = $(time)") + frame(anim, fig) +end +if isdir("./plots") + gif(anim, "plots/02.03_gridsize.gif", fps = 10) +else + gif(anim, "examples/plots/02.03_gridsize.gif", fps = 10) +end +``` + +In the figure we see that the LES has induced some artifacts that influences the dynamics. In the next example, we will solve these artifacts using the Neural part of the CNODEs. 
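+
+To make the comparison quantitative, one can restrict the DNS snapshots to the LES grid and measure the discrepancy directly. This is a minimal sketch, assuming `u_les`, `u_dns` and `les_grid` from the runs above are still in scope (`imresize` was imported from Images earlier):
+
+```julia
+import LinearAlgebra: norm
+# Relative L2 difference between the LES and the coarsened DNS, per snapshot
+rel_err = map(1:size(u_les, 4)) do i
+    u_ref = imresize(u_dns[:, :, 1, i], (les_grid.nux, les_grid.nuy))
+    norm(u_les[:, :, 1, i] .- u_ref) / norm(u_ref)
+end
+plot(rel_err, xlabel = "snapshot", ylabel = "relative L2 error", label = "u: LES vs DNS")
+```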
+ +--- + +*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* + diff --git a/examples/TODO_02.04-GrayScott.ipynb b/examples/TODO_02.04-GrayScott.ipynb new file mode 100644 index 0000000..9db1f08 --- /dev/null +++ b/examples/TODO_02.04-GrayScott.ipynb @@ -0,0 +1,954 @@ +{ + "cells": [ + { + "outputs": [], + "cell_type": "code", + "source": [ + "using Lux\n", + "using SciMLSensitivity\n", + "using DiffEqFlux\n", + "using DifferentialEquations\n", + "using Plots\n", + "using Plots.PlotMeasures\n", + "using Zygote\n", + "using Random\n", + "rng = Random.seed!(1234)\n", + "using OptimizationOptimisers\n", + "using Statistics\n", + "using ComponentArrays\n", + "using CUDA\n", + "using Images\n", + "using Interpolations\n", + "using NNlib\n", + "using FFTW" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Test if CUDA is running" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "CUDA.functional()\n", + "\n", + "CUDA.allowscalar(false)\n", + "ArrayType = CUDA.functional() ? CuArray : Array;\n", + "z = CUDA.functional() ? CUDA.zeros : (s...) -> zeros(Float32, s...)\n", + "# Import our custom backend functions\n", + "include(\"coupling_functions/functions_example.jl\")\n", + "include(\"coupling_functions/functions_NODE.jl\")\n", + "include(\"coupling_functions/functions_CNODE_loss.jl\")\n", + "include(\"coupling_functions/functions_FDderivatives.jl\");\n", + "include(\"coupling_functions/functions_nn.jl\")\n", + "include(\"coupling_functions/functions_FNO.jl\")" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "## Learning the Gray-Scott model" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "In this example, we want to learn a closure to the Gray-Scott model using a Neural Network. Here we use a **coarser grid only on v**, while we keep the **fine grid for u**." 
+ ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "We run multiple GS simulations as discussed in the previous part.\n", + "Notice that the 'fine' grid is now only 40 cells per side, in order to speed up the example" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dux = duy = dvx = dvy = 1.0\n", + "nux = nuy = nvx = nvy = 40\n", + "grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Here, we define the initial condition as a random perturbation over a constant background to add variety" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "function initial_condition(grid, U₀, V₀, ε_u, ε_v; nsimulations = 1)\n", + " u_init = U₀ .+ ε_u .* randn(grid.nux, grid.nuy, nsimulations)\n", + " v_init = V₀ .+ ε_v .* randn(grid.nvx, grid.nvy, nsimulations)\n", + " return u_init, v_init\n", + "end\n", + "U₀ = 0.5 # initial concentration of u\n", + "V₀ = 0.25 # initial concentration of v\n", + "ε_u = 0.05 # magnitude of the perturbation on u\n", + "ε_v = 0.1 # magnitude of the perturbation on v\n", + "u_initial, v_initial = initial_condition(grid, U₀, V₀, ε_u, ε_v, nsimulations = 4);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We can now define the initial condition as a flattened concatenated array" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = vcat(reshape(u_initial, grid.Nu, :), reshape(v_initial, grid.nvx * grid.nvy, :));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "From the literature, we have selected the following parameters in order to form nice patterns" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "D_u = 0.16\n", + "D_v = 0.08\n", + "f = 0.055\n", + "k = 0.062;" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "RHS of GS model" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "F_u(u, v, grid) = D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u)\n", + "G_v(u, v, grid) = D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2 .- (f + k) .* v" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and definition of the model" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_CNODE = create_f_CNODE(F_u, G_v, grid; is_closed = false);\n", + "θ, st = Lux.setup(rng, f_CNODE);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Short *burnout run* to get rid of the initial artifacts" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "trange_burn = (0.0f0, 50.0f0)\n", + "dt, saveat = (1e-2, 1)\n", + "full_CNODE = NeuralODE(f_CNODE,\n", + " trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "burnout_data = Array(full_CNODE(uv0, θ, st)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Second burnout with larger timesteps" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "trange_burn = (0.0f0, 100.0f0)\n", + "dt, saveat = (1, 50)\n", + "full_CNODE = NeuralODE(f_CNODE,\n", + " 
trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "burnout_data = Array(full_CNODE(burnout_data[:, :, end], θ, st)[1]);\n", + "u = reshape(burnout_data[1:(grid.Nu), :, :], grid.nux, grid.nuy, size(burnout_data, 2), :);\n", + "v = reshape(burnout_data[(grid.Nu + 1):end, :, :],\n", + " grid.nvx,\n", + " grid.nvy,\n", + " size(burnout_data, 2),\n", + " :);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "**Data collection run**\n", + "We use the output of the burnout to start a longer simulations" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = burnout_data[:, :, end];\n", + "trange = (0.0f0, 2500.0f0)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "for this data production run, we set `dt=1` and we sample every step" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dt, saveat = (0.1, 0.1)\n", + "full_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat);\n", + "reference_data = Array(full_CNODE(uv0, θ, st)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "And we unpack the solution to get the two species from" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "u = reshape(reference_data[1:(grid.Nu), :, :],\n", + " grid.nux,\n", + " grid.nuy,\n", + " size(reference_data, 2),\n", + " :);\n", + "v = reshape(reference_data[(grid.Nu + 1):end, :, :],\n", + " grid.nvx,\n", + " grid.nvy,\n", + " size(reference_data, 2),\n", + " :);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Plot the data" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "anim = Animation()\n", + "fig = plot(layout = (3, 2), size = (600, 900))\n", + "@gif for i in 1:1000:size(u, 4)\n", + " p1 = heatmap(u[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u(x,y)\")\n", + " p2 = heatmap(v[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v(x,y)\")\n", + " p3 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds)\n", + " p4 = heatmap(v[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p5 = heatmap(u[:, :, 3, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds)\n", + " p6 = heatmap(v[:, :, 3, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " time = round(i * saveat, digits = 0)\n", + " fig = plot(p1, p2, p3, p4, p5, p6, layout = (3, 2), plot_title = \"time = $(time)\")\n", + " frame(anim, fig)\n", + "end\n", + "if isdir(\"./plots\")\n", + " gif(anim, \"./plots/02.04-DNS.gif\", fps = 10)\n", + "else\n", + " gif(anim, \"examples/plots/02.04-DNS.gif\", fps = 10)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "### Collect the data for the coarse grid\n", + "So now we redefine the grid parameters" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "nvx = nvy = 30\n", + "dvx = nux * dux / nvx\n", + "dvy = nuy * duy / nvy\n", + "coarse_grid = Grid(dux, duy, nux, nuy, dvx, 
dvy, nvx, nvy);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "** Show what happens if you use a non-closed model on a smaller grid\n", + "(we are not using any NN here)" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_coarse_CNODE = create_f_CNODE(F_u, G_v, coarse_grid; is_closed = false)\n", + "θ, st = Lux.setup(rng, f_coarse_CNODE);\n", + "uv0 = burnout_data[:, :, end];\n", + "u0b = reshape(uv0[1:(grid.Nu), :], grid.nux, grid.nuy, :);\n", + "v0b = reshape(uv0[(grid.Nu + 1):end, :], grid.nvx, grid.nvy, :);\n", + "v0_coarse = imresize(v0b, (coarse_grid.nvx, coarse_grid.nvy));\n", + "uv0_coarse = vcat(reshape(u0b, coarse_grid.Nu, :), reshape(v0_coarse, coarse_grid.Nv, :));\n", + "closed_CNODE = NeuralODE(f_coarse_CNODE,\n", + " trange,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "closed_CNODE_solution = Array(closed_CNODE(uv0_coarse, θ, st)[1]);\n", + "u_closed = reshape(closed_CNODE_solution[1:(coarse_grid.Nu), :, :],\n", + " coarse_grid.nux,\n", + " coarse_grid.nuy,\n", + " size(closed_CNODE_solution, 2),\n", + " :);\n", + "v_closed = reshape(closed_CNODE_solution[(coarse_grid.Nu + 1):end, :, :],\n", + " coarse_grid.nvx,\n", + " coarse_grid.nvy,\n", + " size(closed_CNODE_solution, 2),\n", + " :);\n", + "anim = Animation()\n", + "fig = plot(layout = (3, 5), size = (500, 300))\n", + "@gif for i in 1:1000:size(u_closed, 4)\n", + " p1 = heatmap(u_closed[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u(x,y) [C]\")\n", + " p2 = heatmap(v_closed[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v(x,y) [C]\")\n", + " p3 = heatmap(u[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u(x,y) [F]\")\n", + " p4 = heatmap(v[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v(x,y) [F]\")\n", + " e = u_closed[:, :, 1, i] .- u[:, :, 1, i]\n", + " p5 = heatmap(e,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :greens,\n", + " title = \"u-Diff\")\n", + " p6 = heatmap(u_closed[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds)\n", + " p7 = heatmap(v_closed[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p8 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds)\n", + " p9 = heatmap(v[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " e = u_closed[:, :, 2, i] .- u[:, :, 2, i]\n", + " p10 = heatmap(e,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :greens,\n", + " title = \"u-Diff\")\n", + " p11 = heatmap(u_closed[:, :, 3, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds)\n", + " p12 = heatmap(v_closed[:, :, 3, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p13 = heatmap(u[:, :, 3, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds)\n", + " p14 = heatmap(v[:, :, 3, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 
1,\n", + " color = :blues)\n", + " e = u_closed[:, :, 3, i] .- u[:, :, 3, i]\n", + " p15 = heatmap(e,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :greens,\n", + " title = \"u-Diff\")\n", + " time = round(i * saveat, digits = 0)\n", + " fig = plot(p1,\n", + " p2,\n", + " p3,\n", + " p4,\n", + " p5,\n", + " p6,\n", + " p7,\n", + " p8,\n", + " p9,\n", + " p10,\n", + " p11,\n", + " p12,\n", + " p13,\n", + " p14,\n", + " p15,\n", + " layout = (3, 5),\n", + " plot_title = \"time = $(time)\")\n", + " frame(anim, fig)\n", + "end\n", + "if isdir(\"./plots\")\n", + " gif(anim, \"./plots/02.04-LES.gif\", fps = 10)\n", + "else\n", + " gif(anim, \"examples/plots/02.04-LES.gif\", fps = 10)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "In order to prepare the loss function, we compute from the simulation data the target that we would like to fit. In the example u will be unchanged, while v will be rescaled to the coarse grid" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "u_target = reshape(reference_data[1:(grid.Nu), :, :],\n", + " grid.nux,\n", + " grid.nuy,\n", + " size(reference_data, 2),\n", + " :);\n", + "v = reshape(reference_data[(grid.Nu + 1):end, :, :],\n", + " grid.nvx,\n", + " grid.nvy,\n", + " size(reference_data, 2),\n", + " :);\n", + "v_target = imresize(v, (coarse_grid.nvx, coarse_grid.nvy));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and pack them together in a target array where u and v are linearized in the first dimension" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "target = vcat(reshape(u_target, grid.Nu, size(u_target, 3), :),\n", + " reshape(v_target, coarse_grid.Nv, size(v_target, 3), :));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and make it into float32" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "target = target |> f32;" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Now create the CNODE with the Neural Network" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "ch_fno = [2, 5, 5, 5, 2];\n", + "kmax_fno = [8, 8, 8, 8];\n", + "σ_fno = [gelu, gelu, gelu, identity];\n", + "NN_u = create_fno_model(kmax_fno, ch_fno, σ_fno);\n", + "NN_v = create_fno_model(kmax_fno, ch_fno, σ_fno)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We can now close the CNODE with the Neural Network" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_closed_CNODE = create_f_CNODE(F_u, G_v, coarse_grid, NN_u, NN_v; is_closed = true)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and make it into float32" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_closed_CNODE = f_closed_CNODE |> f32;\n", + "θ, st = Lux.setup(rng, f_closed_CNODE);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "### Design the **loss function**\n", + "For this example, we use *multishooting a posteriori* fitting (MulDtO), where we tell `Zygote` to compare `nintervals` of length `nunroll` to get the gradient. 
Notice that this method is differentiating through the solution of the NODE!" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "nunroll = 5\n", + "nintervals = 20\n", + "nsamples = 2" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We also define this auxiliary NODE that will be used for training\n", + "We can use smaller time steps for the training" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dt_train = 0.05" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "but we have to sample at the same rate as the data" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "saveat_train = saveat\n", + "t_train_range = (0.0f0, saveat_train * (nunroll + 0)) # it has to be as long as unroll\n", + "training_CNODE = NeuralODE(f_closed_CNODE,\n", + " t_train_range,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt_train,\n", + " saveat = saveat_train);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "* Create the loss" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "myloss = create_randloss_MulDtO(target,\n", + " nunroll = nunroll,\n", + " nintervals = nintervals,\n", + " nsamples = nsamples,\n", + " λ_c = 1e2,\n", + " λ_l1 = 1e-1);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "To initialize the training, we need some objects to monitor the procedure, and we trigger the first compilation." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "lhist = Float32[];" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Initialize and trigger the compilation of the model" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "pinit = ComponentArray(θ);\n", + "myloss(pinit) # trigger compilation" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "[!] 
Check that the loss does not get type warnings, otherwise it will be slower" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Select the autodifferentiation type" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "adtype = Optimization.AutoZygote();" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We transform the NeuralODE into an optimization problem" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "optf = Optimization.OptimizationFunction((x, p) -> myloss(x), adtype);\n", + "optprob = Optimization.OptimizationProblem(optf, pinit);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Select the training algorithm:\n", + "we choose Adam with learning rate 0.1, with gradient clipping" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "ClipAdam = OptimiserChain(Adam(1.0f-2), ClipGrad(1));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Finally we can train the NODE" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "result_neuralode = Optimization.solve(optprob,\n", + " ClipAdam;\n", + " callback = callback,\n", + " maxiters = 3);\n", + "pinit = result_neuralode.u;\n", + "θ = pinit;\n", + "optprob = Optimization.OptimizationProblem(optf, pinit);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "(Notice that the block above can be repeated to continue training)" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "**Notice** that the training is rather slow, so realistically here you can not expect good results in a few iterations." 
+ ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "and finally I use the trained CNODE to compare the solution with the target" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "trange = (0.0f0, 300.0f0)\n", + "trained_CNODE = NeuralODE(f_closed_CNODE,\n", + " trange,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "trained_CNODE_solution = Array(trained_CNODE(uv0_coarse[:, 1:3], θ, st)[1]);\n", + "u_trained = reshape(trained_CNODE_solution[1:(coarse_grid.Nu), :, :],\n", + " coarse_grid.nux,\n", + " coarse_grid.nuy,\n", + " size(trained_CNODE_solution, 2),\n", + " :);\n", + "v_trained = reshape(trained_CNODE_solution[(coarse_grid.Nu + 1):end, :, :],\n", + " coarse_grid.nvx,\n", + " coarse_grid.nvy,\n", + " size(trained_CNODE_solution, 2),\n", + " :);\n", + "anim = Animation()\n", + "fig = plot(layout = (2, 5), size = (750, 300))\n", + "@gif for i in 1:40:size(u_trained, 4)\n", + " p1 = heatmap(u[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Exact\")\n", + " p2 = heatmap(v[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p3 = heatmap(u_trained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Trained\")\n", + " p4 = heatmap(v_trained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " et = abs.(u[:, :, 1, i] .- u_trained[:, :, 1, i])\n", + " p5 = heatmap(et,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :greens,\n", + " title = \"Diff-u\")\n", + " p6 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds)\n", + " p7 = heatmap(v[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p8 = heatmap(u_trained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds)\n", + " p9 = heatmap(v_trained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " e = abs.(u[:, :, 2, i] .- u_trained[:, :, 2, i])\n", + " p10 = heatmap(e, axis = false, cbar = false, aspect_ratio = 1, color = :greens)\n", + "\n", + " time = round(i * saveat, digits = 0)\n", + " fig = plot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, layout = (2, 5), margin = 0mm)\n", + "\n", + " frame(anim, fig)\n", + "end\n", + "if isdir(\"./plots\")\n", + " gif(anim, \"./plots/02.04-NNclosure.gif\", fps = 10)\n", + "else\n", + " gif(anim, \"examples/plots/02.04-NNclosure.gif\", fps = 10)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "---\n", + "\n", + "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*" + ], + "metadata": {} + } + ], + "nbformat_minor": 3, + "metadata": { + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.10.2" + }, + "kernelspec": { + "name": "julia-1.10", + "display_name": "Julia 1.10.2", + "language": "julia" + } + }, + "nbformat": 4 +} diff --git a/examples/TODO_02.04-GrayScott.md b/examples/TODO_02.04-GrayScott.md new file mode 100644 index 0000000..208af30 --- /dev/null +++ b/examples/TODO_02.04-GrayScott.md @@ -0,0 +1,580 @@ +```julia 
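+# Packages for the simulation, the training loop and the plots (assumed to be provided by the project environment)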
+using Lux +using SciMLSensitivity +using DiffEqFlux +using DifferentialEquations +using Plots +using Plots.PlotMeasures +using Zygote +using Random +rng = Random.seed!(1234) +using OptimizationOptimisers +using Statistics +using ComponentArrays +using CUDA +using Images +using Interpolations +using NNlib +using FFTW +``` + +Test if CUDA is running + +```julia +CUDA.functional() + +CUDA.allowscalar(false) +ArrayType = CUDA.functional() ? CuArray : Array; +z = CUDA.functional() ? CUDA.zeros : (s...) -> zeros(Float32, s...) +# Import our custom backend functions +include("coupling_functions/functions_example.jl") +include("coupling_functions/functions_NODE.jl") +include("coupling_functions/functions_CNODE_loss.jl") +include("coupling_functions/functions_FDderivatives.jl"); +include("coupling_functions/functions_nn.jl") +include("coupling_functions/functions_FNO.jl") +``` + +## Learning the Gray-Scott model + +In this example, we want to learn a closure to the Gray-Scott model using a Neural Network. Here we use a **coarser grid only on v**, while we keep the **fine grid for u**. + +We run multiple GS simulations as discussed in the previous part. +Notice that the 'fine' grid is now only 40 cells per side, in order to speed up the example + +```julia +dux = duy = dvx = dvy = 1.0 +nux = nuy = nvx = nvy = 40 +grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy); +``` + +Here, we define the initial condition as a random perturbation over a constant background to add variety + +```julia +function initial_condition(grid, U₀, V₀, ε_u, ε_v; nsimulations = 1) + u_init = U₀ .+ ε_u .* randn(grid.nux, grid.nuy, nsimulations) + v_init = V₀ .+ ε_v .* randn(grid.nvx, grid.nvy, nsimulations) + return u_init, v_init +end +U₀ = 0.5 # initial concentration of u +V₀ = 0.25 # initial concentration of v +ε_u = 0.05 # magnitude of the perturbation on u +ε_v = 0.1 # magnitude of the perturbation on v +u_initial, v_initial = initial_condition(grid, U₀, V₀, ε_u, ε_v, nsimulations = 4); +``` + +We can now define the initial condition as a flattened concatenated array + +```julia +uv0 = vcat(reshape(u_initial, grid.Nu, :), reshape(v_initial, grid.nvx * grid.nvy, :)); +``` + +From the literature, we have selected the following parameters in order to form nice patterns + +```julia +D_u = 0.16 +D_v = 0.08 +f = 0.055 +k = 0.062; +``` + +RHS of GS model + +```julia +F_u(u, v, grid) = D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u) +G_v(u, v, grid) = D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2 .- (f + k) .* v +``` + +and definition of the model + +```julia +f_CNODE = create_f_CNODE(F_u, G_v, grid; is_closed = false); +θ, st = Lux.setup(rng, f_CNODE); +``` + +Short *burnout run* to get rid of the initial artifacts + +```julia +trange_burn = (0.0f0, 50.0f0) +dt, saveat = (1e-2, 1) +full_CNODE = NeuralODE(f_CNODE, + trange_burn, + Tsit5(), + adaptive = false, + dt = dt, + saveat = saveat); +burnout_data = Array(full_CNODE(uv0, θ, st)[1]); +``` + +Second burnout with larger timesteps + +```julia +trange_burn = (0.0f0, 100.0f0) +dt, saveat = (1, 50) +full_CNODE = NeuralODE(f_CNODE, + trange_burn, + Tsit5(), + adaptive = false, + dt = dt, + saveat = saveat); +burnout_data = Array(full_CNODE(burnout_data[:, :, end], θ, st)[1]); +u = reshape(burnout_data[1:(grid.Nu), :, :], grid.nux, grid.nuy, size(burnout_data, 2), :); +v = reshape(burnout_data[(grid.Nu + 1):end, :, :], + grid.nvx, + grid.nvy, + size(burnout_data, 2), + :); +``` + +**Data collection run** +We use the output of the burnout to start 
a longer simulation
+
+```julia
+uv0 = burnout_data[:, :, end];
+trange = (0.0f0, 2500.0f0)
+```
+
+for this data production run, we set `dt = 0.1` and we sample every step
+
+```julia
+dt, saveat = (0.1, 0.1)
+full_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat);
+reference_data = Array(full_CNODE(uv0, θ, st)[1]);
+```
+
+And we unpack the solution to get the two species
+
+```julia
+u = reshape(reference_data[1:(grid.Nu), :, :],
+    grid.nux,
+    grid.nuy,
+    size(reference_data, 2),
+    :);
+v = reshape(reference_data[(grid.Nu + 1):end, :, :],
+    grid.nvx,
+    grid.nvy,
+    size(reference_data, 2),
+    :);
+```
+
+Plot the data
+
+```julia
+anim = Animation()
+fig = plot(layout = (3, 2), size = (600, 900))
+@gif for i in 1:1000:size(u, 4)
+    p1 = heatmap(u[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :reds,
+        title = "u(x,y)")
+    p2 = heatmap(v[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :blues,
+        title = "v(x,y)")
+    p3 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds)
+    p4 = heatmap(v[:, :, 2, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :blues)
+    p5 = heatmap(u[:, :, 3, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds)
+    p6 = heatmap(v[:, :, 3, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :blues)
+    time = round(i * saveat, digits = 0)
+    fig = plot(p1, p2, p3, p4, p5, p6, layout = (3, 2), plot_title = "time = $(time)")
+    frame(anim, fig)
+end
+if isdir("./plots")
+    gif(anim, "./plots/02.04-DNS.gif", fps = 10)
+else
+    gif(anim, "examples/plots/02.04-DNS.gif", fps = 10)
+end
+```
+
+### Collect the data for the coarse grid
+So now we redefine the grid parameters
+
+```julia
+nvx = nvy = 30
+dvx = nux * dux / nvx
+dvy = nuy * duy / nvy
+coarse_grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);
+```
+
+**Show what happens if you use a non-closed model on a smaller grid**
+(we are not using any NN here)
+
+```julia
+f_coarse_CNODE = create_f_CNODE(F_u, G_v, coarse_grid; is_closed = false)
+θ, st = Lux.setup(rng, f_coarse_CNODE);
+uv0 = burnout_data[:, :, end];
+u0b = reshape(uv0[1:(grid.Nu), :], grid.nux, grid.nuy, :);
+v0b = reshape(uv0[(grid.Nu + 1):end, :], grid.nvx, grid.nvy, :);
+v0_coarse = imresize(v0b, (coarse_grid.nvx, coarse_grid.nvy));
+uv0_coarse = vcat(reshape(u0b, coarse_grid.Nu, :), reshape(v0_coarse, coarse_grid.Nv, :));
+closed_CNODE = NeuralODE(f_coarse_CNODE,
+    trange,
+    Tsit5(),
+    adaptive = false,
+    dt = dt,
+    saveat = saveat);
+closed_CNODE_solution = Array(closed_CNODE(uv0_coarse, θ, st)[1]);
+u_closed = reshape(closed_CNODE_solution[1:(coarse_grid.Nu), :, :],
+    coarse_grid.nux,
+    coarse_grid.nuy,
+    size(closed_CNODE_solution, 2),
+    :);
+v_closed = reshape(closed_CNODE_solution[(coarse_grid.Nu + 1):end, :, :],
+    coarse_grid.nvx,
+    coarse_grid.nvy,
+    size(closed_CNODE_solution, 2),
+    :);
+anim = Animation()
+fig = plot(layout = (3, 5), size = (500, 300))
+@gif for i in 1:1000:size(u_closed, 4)
+    p1 = heatmap(u_closed[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :reds,
+        title = "u(x,y) [C]")
+    p2 = heatmap(v_closed[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :blues,
+        title = "v(x,y) [C]")
+    p3 = heatmap(u[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :reds,
+        title = "u(x,y) [F]")
+    p4 = heatmap(v[:, :, 1, i],
+        axis = false,
+        cbar = false,
+        aspect_ratio = 1,
+        color = :blues,
+        title = "v(x,y) [F]")
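+    # pointwise difference between the coarse (non-closed) run and the fine reference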
+ e = u_closed[:, :, 1, i] .- u[:, :, 1, i] + p5 = heatmap(e, + axis = false, + cbar = false, + aspect_ratio = 1, + color = :greens, + title = "u-Diff") + p6 = heatmap(u_closed[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds) + p7 = heatmap(v_closed[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p8 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds) + p9 = heatmap(v[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + e = u_closed[:, :, 2, i] .- u[:, :, 2, i] + p10 = heatmap(e, + axis = false, + cbar = false, + aspect_ratio = 1, + color = :greens, + title = "u-Diff") + p11 = heatmap(u_closed[:, :, 3, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds) + p12 = heatmap(v_closed[:, :, 3, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p13 = heatmap(u[:, :, 3, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds) + p14 = heatmap(v[:, :, 3, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + e = u_closed[:, :, 3, i] .- u[:, :, 3, i] + p15 = heatmap(e, + axis = false, + cbar = false, + aspect_ratio = 1, + color = :greens, + title = "u-Diff") + time = round(i * saveat, digits = 0) + fig = plot(p1, + p2, + p3, + p4, + p5, + p6, + p7, + p8, + p9, + p10, + p11, + p12, + p13, + p14, + p15, + layout = (3, 5), + plot_title = "time = $(time)") + frame(anim, fig) +end +if isdir("./plots") + gif(anim, "./plots/02.04-LES.gif", fps = 10) +else + gif(anim, "examples/plots/02.04-LES.gif", fps = 10) +end +``` + +In order to prepare the loss function, we compute from the simulation data the target that we would like to fit. In the example u will be unchanged, while v will be rescaled to the coarse grid + +```julia +u_target = reshape(reference_data[1:(grid.Nu), :, :], + grid.nux, + grid.nuy, + size(reference_data, 2), + :); +v = reshape(reference_data[(grid.Nu + 1):end, :, :], + grid.nvx, + grid.nvy, + size(reference_data, 2), + :); +v_target = imresize(v, (coarse_grid.nvx, coarse_grid.nvy)); +``` + +and pack them together in a target array where u and v are linearized in the first dimension + +```julia +target = vcat(reshape(u_target, grid.Nu, size(u_target, 3), :), + reshape(v_target, coarse_grid.Nv, size(v_target, 3), :)); +``` + +and make it into float32 + +```julia +target = target |> f32; +``` + +Now create the CNODE with the Neural Network + +```julia +ch_fno = [2, 5, 5, 5, 2]; +kmax_fno = [8, 8, 8, 8]; +σ_fno = [gelu, gelu, gelu, identity]; +NN_u = create_fno_model(kmax_fno, ch_fno, σ_fno); +NN_v = create_fno_model(kmax_fno, ch_fno, σ_fno) +``` + +We can now close the CNODE with the Neural Network + +```julia +f_closed_CNODE = create_f_CNODE(F_u, G_v, coarse_grid, NN_u, NN_v; is_closed = true) +``` + +and make it into float32 + +```julia +f_closed_CNODE = f_closed_CNODE |> f32; +θ, st = Lux.setup(rng, f_closed_CNODE); +``` + +### Design the **loss function** +For this example, we use *multishooting a posteriori* fitting (MulDtO), where we tell `Zygote` to compare `nintervals` of length `nunroll` to get the gradient. Notice that this method is differentiating through the solution of the NODE! 
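+
+For intuition, here is a minimal sketch of what such a multishooting loss can look like, written against the `training_CNODE` defined below. This is *not* the repository implementation (that lives in `coupling_functions/functions_CNODE_loss.jl` and also applies the regularisation weights `λ_c` and `λ_l1`); the window-sampling details here are assumptions.
+
+```julia
+# Illustrative sketch only (hypothetical helper, not the package API):
+# sample `nintervals` random windows of `nunroll` steps and unroll the NODE on each.
+function sketch_randloss_MulDtO(target; nunroll, nintervals, nsamples)
+    nt = size(target, 3)                            # snapshots stored per trajectory
+    function loss(θ)
+        l = 0.0f0
+        for _ in 1:nintervals
+            i0 = rand(1:(nt - nunroll))             # random window start
+            js = rand(1:size(target, 2), nsamples)  # random subset of trajectories
+            # unroll the NODE from the first snapshot of the window...
+            pred = Array(training_CNODE(target[:, js, i0], θ, st)[1])
+            # ...and penalise the mismatch over the whole window
+            l += sum(abs2, pred .- target[:, js, i0:(i0 + nunroll)])
+        end
+        return l / nintervals
+    end
+    return loss
+end
+```
+
+Because every call draws fresh short windows, `Zygote` only ever differentiates through a few unrolled solver steps at a time, rather than through the full data-production run.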
+ +```julia +nunroll = 5 +nintervals = 20 +nsamples = 2 +``` + +We also define this auxiliary NODE that will be used for training +We can use smaller time steps for the training + +```julia +dt_train = 0.05 +``` + +but we have to sample at the same rate as the data + +```julia +saveat_train = saveat +t_train_range = (0.0f0, saveat_train * (nunroll + 0)) # it has to be as long as unroll +training_CNODE = NeuralODE(f_closed_CNODE, + t_train_range, + Tsit5(), + adaptive = false, + dt = dt_train, + saveat = saveat_train); +``` + +* Create the loss + +```julia +myloss = create_randloss_MulDtO(target, + nunroll = nunroll, + nintervals = nintervals, + nsamples = nsamples, + λ_c = 1e2, + λ_l1 = 1e-1); +``` + +To initialize the training, we need some objects to monitor the procedure, and we trigger the first compilation. + +```julia +lhist = Float32[]; +``` + +Initialize and trigger the compilation of the model + +```julia +pinit = ComponentArray(θ); +myloss(pinit) # trigger compilation +``` + +[!] Check that the loss does not get type warnings, otherwise it will be slower + +Select the autodifferentiation type + +```julia +adtype = Optimization.AutoZygote(); +``` + +We transform the NeuralODE into an optimization problem + +```julia +optf = Optimization.OptimizationFunction((x, p) -> myloss(x), adtype); +optprob = Optimization.OptimizationProblem(optf, pinit); +``` + +Select the training algorithm: +we choose Adam with learning rate 0.1, with gradient clipping + +```julia +ClipAdam = OptimiserChain(Adam(1.0f-2), ClipGrad(1)); +``` + +Finally we can train the NODE + +```julia +result_neuralode = Optimization.solve(optprob, + ClipAdam; + callback = callback, + maxiters = 3); +pinit = result_neuralode.u; +θ = pinit; +optprob = Optimization.OptimizationProblem(optf, pinit); +``` + +(Notice that the block above can be repeated to continue training) + +**Notice** that the training is rather slow, so realistically here you can not expect good results in a few iterations. 
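+
+The `callback` passed to `Optimization.solve` above is assumed to come from the included helper files; a minimal stand-in that records the loss history into `lhist` could look like this sketch:
+
+```julia
+# Hypothetical minimal callback: log the loss and keep optimising.
+callback = function (p, l)
+    push!(lhist, l)
+    println("Loss: ", l)
+    return false   # returning true would stop the optimisation early
+end
+```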
+ +and finally I use the trained CNODE to compare the solution with the target + +```julia +trange = (0.0f0, 300.0f0) +trained_CNODE = NeuralODE(f_closed_CNODE, + trange, + Tsit5(), + adaptive = false, + dt = dt, + saveat = saveat); +trained_CNODE_solution = Array(trained_CNODE(uv0_coarse[:, 1:3], θ, st)[1]); +u_trained = reshape(trained_CNODE_solution[1:(coarse_grid.Nu), :, :], + coarse_grid.nux, + coarse_grid.nuy, + size(trained_CNODE_solution, 2), + :); +v_trained = reshape(trained_CNODE_solution[(coarse_grid.Nu + 1):end, :, :], + coarse_grid.nvx, + coarse_grid.nvy, + size(trained_CNODE_solution, 2), + :); +anim = Animation() +fig = plot(layout = (2, 5), size = (750, 300)) +@gif for i in 1:40:size(u_trained, 4) + p1 = heatmap(u[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "Exact") + p2 = heatmap(v[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p3 = heatmap(u_trained[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "Trained") + p4 = heatmap(v_trained[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + et = abs.(u[:, :, 1, i] .- u_trained[:, :, 1, i]) + p5 = heatmap(et, + axis = false, + cbar = false, + aspect_ratio = 1, + color = :greens, + title = "Diff-u") + p6 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds) + p7 = heatmap(v[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p8 = heatmap(u_trained[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds) + p9 = heatmap(v_trained[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + e = abs.(u[:, :, 2, i] .- u_trained[:, :, 2, i]) + p10 = heatmap(e, axis = false, cbar = false, aspect_ratio = 1, color = :greens) + + time = round(i * saveat, digits = 0) + fig = plot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, layout = (2, 5), margin = 0mm) + + frame(anim, fig) +end +if isdir("./plots") + gif(anim, "./plots/02.04-NNclosure.gif", fps = 10) +else + gif(anim, "examples/plots/02.04-NNclosure.gif", fps = 10) +end +``` + +--- + +*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* + diff --git a/examples/TODO_02.04-GrayScott_alternative.ipynb b/examples/TODO_02.04-GrayScott_alternative.ipynb new file mode 100644 index 0000000..8fb8402 --- /dev/null +++ b/examples/TODO_02.04-GrayScott_alternative.ipynb @@ -0,0 +1,941 @@ +{ + "cells": [ + { + "outputs": [], + "cell_type": "code", + "source": [ + "using Lux\n", + "using SciMLSensitivity\n", + "using DiffEqFlux\n", + "using DifferentialEquations\n", + "using Plots\n", + "using Plots.PlotMeasures\n", + "using Zygote\n", + "using Random\n", + "rng = Random.seed!(1234)\n", + "using OptimizationOptimisers\n", + "using Statistics\n", + "using ComponentArrays\n", + "using CUDA\n", + "using Images\n", + "using Interpolations\n", + "using NNlib\n", + "using FFTW\n", + "ArrayType = CUDA.functional() ? 
CuArray : Array;\n", + "# Import our custom backend functions\n", + "include(\"coupling_functions/functions_example.jl\")\n", + "include(\"coupling_functions/functions_NODE.jl\")\n", + "include(\"coupling_functions/functions_CNODE_loss.jl\")\n", + "include(\"coupling_functions/functions_FDderivatives.jl\");\n", + "include(\"coupling_functions/functions_nn.jl\")\n", + "include(\"coupling_functions/functions_FNO.jl\")" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "## Learning the Gray-Scott model (alternative)" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "In this example, we want to learn a closure to the Gray-Scott model using a Neural Network. We will use the same parameters as in the previous example, but we will use a smaller grid to train the closure. **Compared to `Example2_part2` here we use a coarser grid on both u and v**." + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "We run multiple GS simulations as discussed in the previous part.\n", + "Notice that the 'fine' grid is now only 40 cells per side, in order to speed up the example" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dux = duy = dvx = dvy = 1.0\n", + "nux = nuy = nvx = nvy = 40\n", + "grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Here, we define the initial condition as a random perturbation over a constant background to add variety" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "function initial_condition(grid, U₀, V₀, ε_u, ε_v; nsimulations = 1)\n", + " u_init = U₀ .+ ε_u .* randn(grid.nux, grid.nuy, nsimulations)\n", + " v_init = V₀ .+ ε_v .* randn(grid.nvx, grid.nvy, nsimulations)\n", + " return u_init, v_init\n", + "end\n", + "U₀ = 0.5 # initial concentration of u\n", + "V₀ = 0.25 # initial concentration of v\n", + "ε_u = 0.05 # magnitude of the perturbation on u\n", + "ε_v = 0.1 # magnitude of the perturbation on v\n", + "u_initial, v_initial = initial_condition(grid, U₀, V₀, ε_u, ε_v, nsimulations = 4);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We can now define the initial condition as a flattened concatenated array" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = vcat(reshape(u_initial, grid.nux * grid.nuy, :),\n", + " reshape(v_initial, grid.nvx * grid.nvy, :));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "From the literature, we have selected the following parameters in order to form nice patterns" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "D_u = 0.16\n", + "D_v = 0.08\n", + "f = 0.055\n", + "k = 0.062;" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "RHS of GS model" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "F_u(u, v, grid) = D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u)\n", + "G_v(u, v, grid) = D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2 .- (f + k) .* v" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and definition of the model" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + 
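+        "# couple F_u and G_v into a single CNODE right-hand side (no NN closure yet)\n",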
"f_CNODE = create_f_CNODE(F_u, G_v, grid; is_closed = false);\n", + "θ, st = Lux.setup(rng, f_CNODE);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Short *burnout run* to get rid of the initial artifacts" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "trange_burn = (0.0f0, 50.0f0)\n", + "dt, saveat = (1e-2, 1)\n", + "full_CNODE = NeuralODE(f_CNODE,\n", + " trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "burnout_data = Array(full_CNODE(uv0, θ, st)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Second burnout with larger timesteps" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "trange_burn = (0.0f0, 100.0f0)\n", + "dt, saveat = (1, 50)\n", + "full_CNODE = NeuralODE(f_CNODE,\n", + " trange_burn,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "burnout_data = Array(full_CNODE(burnout_data[:, :, end], θ, st)[1]);\n", + "u = reshape(burnout_data[1:(grid.Nu), :, :], grid.nux, grid.nuy, size(burnout_data, 2), :);\n", + "v = reshape(burnout_data[(grid.Nu + 1):end, :, :],\n", + " grid.nvx,\n", + " grid.nvy,\n", + " size(burnout_data, 2),\n", + " :);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "**Data collection run**\n", + "We use the output of the burnout to start a longer simulations" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "uv0 = burnout_data[:, :, end]\n", + "trange = (0.0f0, 2500.0f0)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "for this data production run, we set `dt=1` and we sample every step" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dt, saveat = (0.1, 0.1)\n", + "full_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat);\n", + "reference_data = Array(full_CNODE(uv0, θ, st)[1]);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "And we unpack the solution to get the two species from" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "u = reshape(reference_data[1:(grid.Nu), :, :],\n", + " grid.nux,\n", + " grid.nuy,\n", + " size(reference_data, 2),\n", + " :);\n", + "v = reshape(reference_data[(grid.Nu + 1):end, :, :],\n", + " grid.nvx,\n", + " grid.nvy,\n", + " size(reference_data, 2),\n", + " :);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Plot the data" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "anim = Animation()\n", + "fig = plot(layout = (3, 2), size = (600, 900))\n", + "@gif for i in 1:1000:size(u, 4)\n", + " p1 = heatmap(u[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u(x,y)\")\n", + " p2 = heatmap(v[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v(x,y)\")\n", + " p3 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds)\n", + " p4 = heatmap(v[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p5 = heatmap(u[:, 
:, 3, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds)\n", + " p6 = heatmap(v[:, :, 3, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " time = round(i * saveat, digits = 0)\n", + " fig = plot(p1, p2, p3, p4, p5, p6, layout = (3, 2), plot_title = \"time = $(time)\")\n", + " frame(anim, fig)\n", + "end\n", + "if isdir(\"./plots\")\n", + " gif(anim, \"./plots/multi_GS.gif\", fps = 10)\n", + "else\n", + " gif(anim, \"examples/plots/multi_GS.gif\", fps = 10)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "### Collect the data for the coarse grid\n", + "So now we redefine the grid parameters" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "nvx = nvy = nux = nuy = 30\n", + "dvx = 40 * dux / nvx\n", + "dvy = 40 * duy / nvy\n", + "dux = 40 * dux / nvx\n", + "duy = 40 * duy / nvy\n", + "coarse_grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Here we show what happens if you use a non-closed model on the coarse grid of this example" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_coarse_CNODE = create_f_CNODE(F_u, G_v, coarse_grid; is_closed = false)\n", + "θ, st = Lux.setup(rng, f_coarse_CNODE);\n", + "uv0 = burnout_data[:, :, end];\n", + "u0b = reshape(uv0[1:(grid.nux * grid.nuy), :], grid.nux, grid.nuy, :);\n", + "v0b = reshape(uv0[(grid.nux * grid.nuy + 1):end, :], grid.nvx, grid.nvy, :);\n", + "u0_coarse = imresize(u0b, (coarse_grid.nux, coarse_grid.nuy));\n", + "v0_coarse = imresize(v0b, (coarse_grid.nvx, coarse_grid.nvy));\n", + "uv0_coarse = vcat(reshape(u0_coarse, coarse_grid.nux * coarse_grid.nuy, :),\n", + " reshape(v0_coarse, coarse_grid.nvx * coarse_grid.nvy, :));\n", + "closed_CNODE = NeuralODE(f_coarse_CNODE,\n", + " trange,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "closed_CNODE_solution = Array(closed_CNODE(uv0_coarse, θ, st)[1]);\n", + "u_closed = reshape(closed_CNODE_solution[1:(coarse_grid.nux * coarse_grid.nuy), :, :],\n", + " coarse_grid.nux,\n", + " coarse_grid.nuy,\n", + " size(closed_CNODE_solution, 2),\n", + " :);\n", + "v_closed = reshape(\n", + " closed_CNODE_solution[(coarse_grid.nux * coarse_grid.nuy + 1):end, :, :],\n", + " coarse_grid.nvx,\n", + " coarse_grid.nvy,\n", + " size(closed_CNODE_solution, 2),\n", + " :);\n", + "anim = Animation()\n", + "fig = plot(layout = (3, 5), size = (500, 300))\n", + "@gif for i in 1:1000:size(u_closed, 4)\n", + " p1 = heatmap(u_closed[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u(x,y) [C]\")\n", + " p2 = heatmap(v_closed[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v(x,y) [C]\")\n", + " p3 = heatmap(u[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"u(x,y) [F]\")\n", + " p4 = heatmap(v[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues,\n", + " title = \"v(x,y) [F]\")\n", + " e = u_closed[:, :, 1, i] .- imresize(u[:, :, 1, i], coarse_grid.nux, coarse_grid.nuy)\n", + " p5 = heatmap(e,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :greens,\n", 
+ " title = \"u-Diff\")\n", + " p6 = heatmap(u_closed[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds)\n", + " p7 = heatmap(v_closed[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p8 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds)\n", + " p9 = heatmap(v[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " e = u_closed[:, :, 2, i] .- imresize(u[:, :, 2, i], coarse_grid.nux, coarse_grid.nuy)\n", + " p10 = heatmap(e,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :greens,\n", + " title = \"u-Diff\")\n", + " p11 = heatmap(u_closed[:, :, 3, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds)\n", + " p12 = heatmap(v_closed[:, :, 3, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p13 = heatmap(u[:, :, 3, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds)\n", + " p14 = heatmap(v[:, :, 3, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " e = u_closed[:, :, 3, i] .- imresize(u[:, :, 3, i], coarse_grid.nux, coarse_grid.nuy)\n", + " p15 = heatmap(e,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :greens,\n", + " title = \"u-Diff\")\n", + " time = round(i * saveat, digits = 0)\n", + " fig = plot(p1,\n", + " p2,\n", + " p3,\n", + " p4,\n", + " p5,\n", + " p6,\n", + " p7,\n", + " p8,\n", + " p9,\n", + " p10,\n", + " p11,\n", + " p12,\n", + " p13,\n", + " p14,\n", + " p15,\n", + " layout = (3, 5),\n", + " plot_title = \"time = $(time)\")\n", + " frame(anim, fig)\n", + "end\n", + "if isdir(\"./plots\")\n", + " gif(anim, \"./plots/multi_GS_coarse_alternative.gif\", fps = 10)\n", + "else\n", + " gif(anim, \"examples/plots/multi_GS_coarse_alternative.gif\", fps = 10)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "In order to prepare the loss function, we compute from the simulation data the target that we would like to fit. 
In the example u will be unchanged, while v will be rescaled to the coarse grid" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "u = reshape(reference_data[1:(grid.nux * grid.nuy), :, :],\n", + " grid.nux,\n", + " grid.nuy,\n", + " size(reference_data, 2),\n", + " :);\n", + "v = reshape(reference_data[(grid.nux * grid.nuy + 1):end, :, :],\n", + " grid.nvx,\n", + " grid.nvy,\n", + " size(reference_data, 2),\n", + " :);\n", + "u_target = imresize(u, (coarse_grid.nvx, coarse_grid.nvy));\n", + "v_target = imresize(v, (coarse_grid.nvx, coarse_grid.nvy));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and pack them together in a target array where u and v are linearized in the first dimension" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "target = vcat(reshape(u_target, coarse_grid.nux * coarse_grid.nuy, size(u_target, 3), :),\n", + " reshape(v_target, coarse_grid.nvx * coarse_grid.nvy, size(v_target, 3), :));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and make it into float32" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "target = target |> f32;" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Now create the CNODE with the Neural Network" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "ch_fno = [2, 5, 5, 5, 2];\n", + "kmax_fno = [8, 8, 8, 8];\n", + "σ_fno = [gelu, gelu, gelu, identity];\n", + "NN_u = create_fno_model(kmax_fno, ch_fno, σ_fno);\n", + "NN_v = create_fno_model(kmax_fno, ch_fno, σ_fno);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We can now close the CNODE with the Neural Network" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_closed_CNODE = create_f_CNODE(F_u, G_v, coarse_grid, NN_u, NN_v; is_closed = true);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and make it into float32" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "f_closed_CNODE = f_closed_CNODE |> f32;\n", + "θ, st = Lux.setup(rng, f_closed_CNODE);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "### Design the **loss function**\n", + "For this example, we use *multishooting a posteriori* fitting (MulDtO), where we tell `Zygote` to compare `nintervals` of length `nunroll` to get the gradient. Notice that this method is differentiating through the solution of the NODE!" 
+ ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "nunroll = 5\n", + "nintervals = 5\n", + "nsamples = 2" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We also define this auxiliary NODE that will be used for training\n", + "We can use smaller time steps for the training because the untrained parameters will cause unstability" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dt_train = 0.05" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "but we have to sample at the same rate as the data" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "saveat_train = saveat\n", + "t_train_range = (0.0f0, saveat_train * (nunroll + 0)) # it has to be as long as unroll\n", + "training_CNODE = NeuralODE(f_closed_CNODE,\n", + " t_train_range,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt_train,\n", + " saveat = saveat_train);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "* Create the loss" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "myloss = create_randloss_MulDtO(target,\n", + " nunroll = nunroll,\n", + " nintervals = nintervals,\n", + " nsamples = nsamples,\n", + " λ = 0.1);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "To initialize the training, we need some objects to monitor the procedure, and we trigger the first compilation." + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "lhist = Float32[];" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Initialize and trigger the compilation of the model" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "pinit = ComponentArray(θ);\n", + "myloss(pinit) # trigger compilation" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "[!] 
Check that the loss does not get type warnings, otherwise it will be slower" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "Select the autodifferentiation type" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "adtype = Optimization.AutoZygote();" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We transform the NeuralODE into an optimization problem" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "optf = Optimization.OptimizationFunction((x, p) -> myloss(x), adtype);\n", + "optprob = Optimization.OptimizationProblem(optf, pinit);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Select the training algorithm:\n", + "we choose Adam with learning rate 0.01, with gradient clipping" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "ClipAdam = OptimiserChain(Adam(1.0f-2), ClipGrad(1));" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Finally we can train the NODE" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "result_neuralode = Optimization.solve(optprob,\n", + " ClipAdam;\n", + " callback = callback,\n", + " maxiters = 3);\n", + "pinit = result_neuralode.u;\n", + "optprob = Optimization.OptimizationProblem(optf, pinit);" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "(Notice that the block above can be repeated to continue training)" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "**Notice** that the training is rather slow, so realistically here you can not expect good results in a few iterations." 
+ ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "and finally I use the trained CNODE to compare the solution with the target" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "trange = (0.0f0, 300.0f0)\n", + "trained_CNODE = NeuralODE(f_closed_CNODE,\n", + " trange,\n", + " Tsit5(),\n", + " adaptive = false,\n", + " dt = dt,\n", + " saveat = saveat);\n", + "trained_CNODE_solution = Array(trained_CNODE(uv0_coarse[:, 1:3], θ, st)[1]);\n", + "u_trained = reshape(trained_CNODE_solution[1:(coarse_grid.Nu), :, :],\n", + " coarse_grid.nux,\n", + " coarse_grid.nuy,\n", + " size(trained_CNODE_solution, 2),\n", + " :);\n", + "v_trained = reshape(trained_CNODE_solution[(coarse_grid.Nu + 1):end, :, :],\n", + " coarse_grid.nvx,\n", + " coarse_grid.nvy,\n", + " size(trained_CNODE_solution, 2),\n", + " :);\n", + "anim = Animation()\n", + "fig = plot(layout = (2, 5), size = (750, 300))\n", + "@gif for i in 1:40:size(u_trained, 4)\n", + " p1 = heatmap(u[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Exact\")\n", + " p2 = heatmap(v[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p3 = heatmap(u_trained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds,\n", + " title = \"Trained\")\n", + " p4 = heatmap(v_trained[:, :, 1, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " et = abs.(imresize(u[:, :, 1, i], coarse_grid.nux, coarse_grid.nuy) .-\n", + " u_trained[:, :, 1, i])\n", + " p5 = heatmap(et,\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :greens,\n", + " title = \"Diff-u\")\n", + " p6 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds)\n", + " p7 = heatmap(v[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " p8 = heatmap(u_trained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :reds)\n", + " p9 = heatmap(v_trained[:, :, 2, i],\n", + " axis = false,\n", + " cbar = false,\n", + " aspect_ratio = 1,\n", + " color = :blues)\n", + " e = abs.(imresize(u[:, :, 2, i], coarse_grid.nux, coarse_grid.nuy) .-\n", + " u_trained[:, :, 2, i])\n", + " p10 = heatmap(e, axis = false, cbar = false, aspect_ratio = 1, color = :greens)\n", + "\n", + " time = round(i * saveat, digits = 0)\n", + " fig = plot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, layout = (2, 5), margin = 0mm)\n", + "\n", + " frame(anim, fig)\n", + "end\n", + "if isdir(\"./plots\")\n", + " gif(anim, \"./plots/trained_GS.gif\", fps = 10)\n", + "else\n", + " gif(anim, \"examples/plots/trained_GS.gif\", fps = 10)\n", + "end" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "---\n", + "\n", + "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*" + ], + "metadata": {} + } + ], + "nbformat_minor": 3, + "metadata": { + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.10.2" + }, + "kernelspec": { + "name": "julia-1.10", + "display_name": "Julia 1.10.2", + "language": "julia" + } + }, + "nbformat": 4 +} diff --git a/examples/TODO_02.04-GrayScott_alternative.md b/examples/TODO_02.04-GrayScott_alternative.md new file 
mode 100644 index 0000000..f382b0d --- /dev/null +++ b/examples/TODO_02.04-GrayScott_alternative.md @@ -0,0 +1,577 @@ +```julia +using Lux +using SciMLSensitivity +using DiffEqFlux +using DifferentialEquations +using Plots +using Plots.PlotMeasures +using Zygote +using Random +rng = Random.seed!(1234) +using OptimizationOptimisers +using Statistics +using ComponentArrays +using CUDA +using Images +using Interpolations +using NNlib +using FFTW +ArrayType = CUDA.functional() ? CuArray : Array; +# Import our custom backend functions +include("coupling_functions/functions_example.jl") +include("coupling_functions/functions_NODE.jl") +include("coupling_functions/functions_CNODE_loss.jl") +include("coupling_functions/functions_FDderivatives.jl"); +include("coupling_functions/functions_nn.jl") +include("coupling_functions/functions_FNO.jl") +``` + +## Learning the Gray-Scott model (alternative) + +In this example, we want to learn a closure to the Gray-Scott model using a Neural Network. We will use the same parameters as in the previous example, but we will use a smaller grid to train the closure. **Compared to `Example2_part2` here we use a coarser grid on both u and v**. + +We run multiple GS simulations as discussed in the previous part. +Notice that the 'fine' grid is now only 40 cells per side, in order to speed up the example + +```julia +dux = duy = dvx = dvy = 1.0 +nux = nuy = nvx = nvy = 40 +grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy); +``` + +Here, we define the initial condition as a random perturbation over a constant background to add variety + +```julia +function initial_condition(grid, U₀, V₀, ε_u, ε_v; nsimulations = 1) + u_init = U₀ .+ ε_u .* randn(grid.nux, grid.nuy, nsimulations) + v_init = V₀ .+ ε_v .* randn(grid.nvx, grid.nvy, nsimulations) + return u_init, v_init +end +U₀ = 0.5 # initial concentration of u +V₀ = 0.25 # initial concentration of v +ε_u = 0.05 # magnitude of the perturbation on u +ε_v = 0.1 # magnitude of the perturbation on v +u_initial, v_initial = initial_condition(grid, U₀, V₀, ε_u, ε_v, nsimulations = 4); +``` + +We can now define the initial condition as a flattened concatenated array + +```julia +uv0 = vcat(reshape(u_initial, grid.nux * grid.nuy, :), + reshape(v_initial, grid.nvx * grid.nvy, :)); +``` + +From the literature, we have selected the following parameters in order to form nice patterns + +```julia +D_u = 0.16 +D_v = 0.08 +f = 0.055 +k = 0.062; +``` + +RHS of GS model + +```julia +F_u(u, v, grid) = D_u * Laplacian(u, grid.dux, grid.duy) .- u .* v .^ 2 .+ f .* (1.0 .- u) +G_v(u, v, grid) = D_v * Laplacian(v, grid.dvx, grid.dvy) .+ u .* v .^ 2 .- (f + k) .* v +``` + +and definition of the model + +```julia +f_CNODE = create_f_CNODE(F_u, G_v, grid; is_closed = false); +θ, st = Lux.setup(rng, f_CNODE); +``` + +Short *burnout run* to get rid of the initial artifacts + +```julia +trange_burn = (0.0f0, 50.0f0) +dt, saveat = (1e-2, 1) +full_CNODE = NeuralODE(f_CNODE, + trange_burn, + Tsit5(), + adaptive = false, + dt = dt, + saveat = saveat); +burnout_data = Array(full_CNODE(uv0, θ, st)[1]); +``` + +Second burnout with larger timesteps + +```julia +trange_burn = (0.0f0, 100.0f0) +dt, saveat = (1, 50) +full_CNODE = NeuralODE(f_CNODE, + trange_burn, + Tsit5(), + adaptive = false, + dt = dt, + saveat = saveat); +burnout_data = Array(full_CNODE(burnout_data[:, :, end], θ, st)[1]); +u = reshape(burnout_data[1:(grid.Nu), :, :], grid.nux, grid.nuy, size(burnout_data, 2), :); +v = reshape(burnout_data[(grid.Nu + 1):end, :, :], + grid.nvx, + grid.nvy, 
+ size(burnout_data, 2), + :); +``` + +**Data collection run** +We use the output of the burnout to start a longer simulations + +```julia +uv0 = burnout_data[:, :, end] +trange = (0.0f0, 2500.0f0) +``` + +for this data production run, we set `dt=1` and we sample every step + +```julia +dt, saveat = (0.1, 0.1) +full_CNODE = NeuralODE(f_CNODE, trange, Tsit5(), adaptive = false, dt = dt, saveat = saveat); +reference_data = Array(full_CNODE(uv0, θ, st)[1]); +``` + +And we unpack the solution to get the two species from + +```julia +u = reshape(reference_data[1:(grid.Nu), :, :], + grid.nux, + grid.nuy, + size(reference_data, 2), + :); +v = reshape(reference_data[(grid.Nu + 1):end, :, :], + grid.nvx, + grid.nvy, + size(reference_data, 2), + :); +``` + +Plot the data + +```julia +anim = Animation() +fig = plot(layout = (3, 2), size = (600, 900)) +@gif for i in 1:1000:size(u, 4) + p1 = heatmap(u[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "u(x,y)") + p2 = heatmap(v[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues, + title = "v(x,y)") + p3 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds) + p4 = heatmap(v[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p5 = heatmap(u[:, :, 3, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds) + p6 = heatmap(v[:, :, 3, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + time = round(i * saveat, digits = 0) + fig = plot(p1, p2, p3, p4, p5, p6, layout = (3, 2), plot_title = "time = $(time)") + frame(anim, fig) +end +if isdir("./plots") + gif(anim, "./plots/multi_GS.gif", fps = 10) +else + gif(anim, "examples/plots/multi_GS.gif", fps = 10) +end +``` + +### Collect the data for the coarse grid +So now we redefine the grid parameters + +```julia +nvx = nvy = nux = nuy = 30 +dvx = 40 * dux / nvx +dvy = 40 * duy / nvy +dux = 40 * dux / nvx +duy = 40 * duy / nvy +coarse_grid = Grid(dux, duy, nux, nuy, dvx, dvy, nvx, nvy); +``` + +Here we show what happens if you use a non-closed model on the coarse grid of this example + +```julia +f_coarse_CNODE = create_f_CNODE(F_u, G_v, coarse_grid; is_closed = false) +θ, st = Lux.setup(rng, f_coarse_CNODE); +uv0 = burnout_data[:, :, end]; +u0b = reshape(uv0[1:(grid.nux * grid.nuy), :], grid.nux, grid.nuy, :); +v0b = reshape(uv0[(grid.nux * grid.nuy + 1):end, :], grid.nvx, grid.nvy, :); +u0_coarse = imresize(u0b, (coarse_grid.nux, coarse_grid.nuy)); +v0_coarse = imresize(v0b, (coarse_grid.nvx, coarse_grid.nvy)); +uv0_coarse = vcat(reshape(u0_coarse, coarse_grid.nux * coarse_grid.nuy, :), + reshape(v0_coarse, coarse_grid.nvx * coarse_grid.nvy, :)); +closed_CNODE = NeuralODE(f_coarse_CNODE, + trange, + Tsit5(), + adaptive = false, + dt = dt, + saveat = saveat); +closed_CNODE_solution = Array(closed_CNODE(uv0_coarse, θ, st)[1]); +u_closed = reshape(closed_CNODE_solution[1:(coarse_grid.nux * coarse_grid.nuy), :, :], + coarse_grid.nux, + coarse_grid.nuy, + size(closed_CNODE_solution, 2), + :); +v_closed = reshape( + closed_CNODE_solution[(coarse_grid.nux * coarse_grid.nuy + 1):end, :, :], + coarse_grid.nvx, + coarse_grid.nvy, + size(closed_CNODE_solution, 2), + :); +anim = Animation() +fig = plot(layout = (3, 5), size = (500, 300)) +@gif for i in 1:1000:size(u_closed, 4) + p1 = heatmap(u_closed[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "u(x,y) [C]") + p2 = heatmap(v_closed[:, :, 1, i], + axis = false, + 
cbar = false, + aspect_ratio = 1, + color = :blues, + title = "v(x,y) [C]") + p3 = heatmap(u[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "u(x,y) [F]") + p4 = heatmap(v[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues, + title = "v(x,y) [F]") + e = u_closed[:, :, 1, i] .- imresize(u[:, :, 1, i], coarse_grid.nux, coarse_grid.nuy) + p5 = heatmap(e, + axis = false, + cbar = false, + aspect_ratio = 1, + color = :greens, + title = "u-Diff") + p6 = heatmap(u_closed[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds) + p7 = heatmap(v_closed[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p8 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds) + p9 = heatmap(v[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + e = u_closed[:, :, 2, i] .- imresize(u[:, :, 2, i], coarse_grid.nux, coarse_grid.nuy) + p10 = heatmap(e, + axis = false, + cbar = false, + aspect_ratio = 1, + color = :greens, + title = "u-Diff") + p11 = heatmap(u_closed[:, :, 3, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds) + p12 = heatmap(v_closed[:, :, 3, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p13 = heatmap(u[:, :, 3, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds) + p14 = heatmap(v[:, :, 3, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + e = u_closed[:, :, 3, i] .- imresize(u[:, :, 3, i], coarse_grid.nux, coarse_grid.nuy) + p15 = heatmap(e, + axis = false, + cbar = false, + aspect_ratio = 1, + color = :greens, + title = "u-Diff") + time = round(i * saveat, digits = 0) + fig = plot(p1, + p2, + p3, + p4, + p5, + p6, + p7, + p8, + p9, + p10, + p11, + p12, + p13, + p14, + p15, + layout = (3, 5), + plot_title = "time = $(time)") + frame(anim, fig) +end +if isdir("./plots") + gif(anim, "./plots/multi_GS_coarse_alternative.gif", fps = 10) +else + gif(anim, "examples/plots/multi_GS_coarse_alternative.gif", fps = 10) +end +``` + +In order to prepare the loss function, we compute from the simulation data the target that we would like to fit. 
In this example both u and v are rescaled to the coarse grid
+
+```julia
+u = reshape(reference_data[1:(grid.nux * grid.nuy), :, :],
+    grid.nux,
+    grid.nuy,
+    size(reference_data, 2),
+    :);
+v = reshape(reference_data[(grid.nux * grid.nuy + 1):end, :, :],
+    grid.nvx,
+    grid.nvy,
+    size(reference_data, 2),
+    :);
+u_target = imresize(u, (coarse_grid.nvx, coarse_grid.nvy));
+v_target = imresize(v, (coarse_grid.nvx, coarse_grid.nvy));
+```
+
+and pack them together in a target array where u and v are linearized in the first dimension
+
+```julia
+target = vcat(reshape(u_target, coarse_grid.nux * coarse_grid.nuy, size(u_target, 3), :),
+    reshape(v_target, coarse_grid.nvx * coarse_grid.nvy, size(v_target, 3), :));
+```
+
+and make it into float32
+
+```julia
+target = target |> f32;
+```
+
+Now create the CNODE with the Neural Network
+
+```julia
+ch_fno = [2, 5, 5, 5, 2];
+kmax_fno = [8, 8, 8, 8];
+σ_fno = [gelu, gelu, gelu, identity];
+NN_u = create_fno_model(kmax_fno, ch_fno, σ_fno);
+NN_v = create_fno_model(kmax_fno, ch_fno, σ_fno);
+```
+
+We can now close the CNODE with the Neural Network
+
+```julia
+f_closed_CNODE = create_f_CNODE(F_u, G_v, coarse_grid, NN_u, NN_v; is_closed = true);
+```
+
+and make it into float32
+
+```julia
+f_closed_CNODE = f_closed_CNODE |> f32;
+θ, st = Lux.setup(rng, f_closed_CNODE);
+```
+
+### Design the **loss function**
+For this example, we use *multishooting a posteriori* fitting (MulDtO), where we tell `Zygote` to compare `nintervals` intervals of length `nunroll` to get the gradient. Notice that this method is differentiating through the solution of the NODE!
+
+```julia
+nunroll = 5
+nintervals = 5
+nsamples = 2
+```
+
+We also define this auxiliary NODE that will be used for training.
+We can use smaller time steps for the training because the untrained parameters will cause instability
+
+```julia
+dt_train = 0.05
+```
+
+but we have to sample at the same rate as the data
+
+```julia
+saveat_train = saveat
+t_train_range = (0.0f0, saveat_train * (nunroll + 0)) # it has to be as long as unroll
+training_CNODE = NeuralODE(f_closed_CNODE,
+    t_train_range,
+    Tsit5(),
+    adaptive = false,
+    dt = dt_train,
+    saveat = saveat_train);
+```
+
+* Create the loss
+
+```julia
+myloss = create_randloss_MulDtO(target,
+    nunroll = nunroll,
+    nintervals = nintervals,
+    nsamples = nsamples,
+    λ = 0.1);
+```
+
+To initialize the training, we need some objects to monitor the procedure, and we trigger the first compilation.
+
+```julia
+lhist = Float32[];
+```
+
+Initialize and trigger the compilation of the model
+
+```julia
+pinit = ComponentArray(θ);
+myloss(pinit) # trigger compilation
+```
+
+[!] 
Check that the loss does not get type warnings, otherwise it will be slower + +Select the autodifferentiation type + +```julia +adtype = Optimization.AutoZygote(); +``` + +We transform the NeuralODE into an optimization problem + +```julia +optf = Optimization.OptimizationFunction((x, p) -> myloss(x), adtype); +optprob = Optimization.OptimizationProblem(optf, pinit); +``` + +Select the training algorithm: +we choose Adam with learning rate 0.01, with gradient clipping + +```julia +ClipAdam = OptimiserChain(Adam(1.0f-2), ClipGrad(1)); +``` + +Finally we can train the NODE + +```julia +result_neuralode = Optimization.solve(optprob, + ClipAdam; + callback = callback, + maxiters = 3); +pinit = result_neuralode.u; +optprob = Optimization.OptimizationProblem(optf, pinit); +``` + +(Notice that the block above can be repeated to continue training) + +**Notice** that the training is rather slow, so realistically here you can not expect good results in a few iterations. + +and finally I use the trained CNODE to compare the solution with the target + +```julia +trange = (0.0f0, 300.0f0) +trained_CNODE = NeuralODE(f_closed_CNODE, + trange, + Tsit5(), + adaptive = false, + dt = dt, + saveat = saveat); +trained_CNODE_solution = Array(trained_CNODE(uv0_coarse[:, 1:3], θ, st)[1]); +u_trained = reshape(trained_CNODE_solution[1:(coarse_grid.Nu), :, :], + coarse_grid.nux, + coarse_grid.nuy, + size(trained_CNODE_solution, 2), + :); +v_trained = reshape(trained_CNODE_solution[(coarse_grid.Nu + 1):end, :, :], + coarse_grid.nvx, + coarse_grid.nvy, + size(trained_CNODE_solution, 2), + :); +anim = Animation() +fig = plot(layout = (2, 5), size = (750, 300)) +@gif for i in 1:40:size(u_trained, 4) + p1 = heatmap(u[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "Exact") + p2 = heatmap(v[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p3 = heatmap(u_trained[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds, + title = "Trained") + p4 = heatmap(v_trained[:, :, 1, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + et = abs.(imresize(u[:, :, 1, i], coarse_grid.nux, coarse_grid.nuy) .- + u_trained[:, :, 1, i]) + p5 = heatmap(et, + axis = false, + cbar = false, + aspect_ratio = 1, + color = :greens, + title = "Diff-u") + p6 = heatmap(u[:, :, 2, i], axis = false, cbar = false, aspect_ratio = 1, color = :reds) + p7 = heatmap(v[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + p8 = heatmap(u_trained[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :reds) + p9 = heatmap(v_trained[:, :, 2, i], + axis = false, + cbar = false, + aspect_ratio = 1, + color = :blues) + e = abs.(imresize(u[:, :, 2, i], coarse_grid.nux, coarse_grid.nuy) .- + u_trained[:, :, 2, i]) + p10 = heatmap(e, axis = false, cbar = false, aspect_ratio = 1, color = :greens) + + time = round(i * saveat, digits = 0) + fig = plot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, layout = (2, 5), margin = 0mm) + + frame(anim, fig) +end +if isdir("./plots") + gif(anim, "./plots/trained_GS.gif", fps = 10) +else + gif(anim, "examples/plots/trained_GS.gif", fps = 10) +end +``` + +--- + +*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* + diff --git a/examples/TODO_03.01-Advecion.ipynb b/examples/TODO_03.01-Advecion.ipynb new file mode 100644 index 0000000..60e4b4e --- /dev/null +++ b/examples/TODO_03.01-Advecion.ipynb @@ -0,0 +1,301 @@ +{ + 
"cells": [ + { + "outputs": [], + "cell_type": "code", + "source": [ + "using Lux\n", + "using NaNMath\n", + "using SciMLSensitivity\n", + "using DiffEqFlux\n", + "using DifferentialEquations\n", + "using Plots\n", + "using Zygote\n", + "using Random\n", + "rng = Random.seed!(1234)\n", + "using OptimizationOptimisers\n", + "using Statistics\n", + "using ComponentArrays\n", + "using CUDA\n", + "ArrayType = CUDA.functional() ? CuArray : Array" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Import our custom backend functions" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "include(\"coupling_functions/functions_example.jl\")\n", + "include(\"coupling_functions/functions_NODE.jl\")\n", + "include(\"coupling_functions/functions_loss.jl\")" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "We want to solve the convection diffusion equation" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "this is the equation we want to solve" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "write it as a system of first order ODEs" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "create the grid" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "x = collect(LinRange(-pi, pi, 101))\n", + "y = collect(LinRange(-pi, pi, 101))" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "so we get this dx and dy (constant grid)" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "dx = x[2] - x[1]\n", + "dy = y[2] - y[1]" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "and the initial condition is random" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "c0 = rand(101, 101)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "which is a scalar field because we are looking for the concentration of a single species" + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "the user specifies this equation" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + "function gen_conv_diff_f(speed, viscosity, dx, dy)" + ], + "metadata": {}, + "execution_count": null + }, + { + "cell_type": "markdown", + "source": [ + "Derivatives using finite differences" + ], + "metadata": {} + }, + { + "outputs": [], + "cell_type": "code", + "source": [ + " function first_derivative(u, Δx, Δy)\n", + " du_dx = zeros(size(u))\n", + " du_dy = zeros(size(u))\n", + "\n", + " du_dx[:, 2:(end - 1)] = (u[:, 3:end] - u[:, 1:(end - 2)]) / (2 * Δx)\n", + " du_dx[:, 1] = (u[:, 2] - u[:, end]) / (2 * Δx)\n", + " du_dx[:, end] = (u[:, 1] - u[:, end - 1]) / (2 * Δx)\n", + "\n", + " du_dy[2:(end - 1), :] = (u[3:end, :] - u[1:(end - 2), :]) / (2 * Δy)\n", + " du_dy[1, :] = (u[2, :] - u[end, :]) / (2 * Δy)\n", + " du_dy[end, :] = (u[1, :] - u[end - 1, :]) / (2 * Δy)\n", + "\n", + " return du_dx, du_dy\n", + " end\n", + " function second_derivative(u, Δx, Δy)\n", + " d2u_dx2 = zeros(size(u))\n", + " d2u_dy2 = zeros(size(u))\n", + "\n", + " d2u_dx2[:, 2:(end - 1)] = (u[:, 3:end] - 2 * u[:, 2:(end - 1)] +\n", + " u[:, 1:(end - 2)]) / (Δx^2)\n", + " d2u_dx2[:, 1] = (u[:, 2] - 2 * u[:, 1] + u[:, end]) / (Δx^2)\n", + " d2u_dx2[:, end] = 
+  {
+   "cell_type": "markdown",
+   "source": [
+    "create the NN"
+   ],
+   "metadata": {}
+  },
+  {
+   "outputs": [],
+   "cell_type": "code",
+   "source": [
+    "NN = create_nn_cd()"
+   ],
+   "metadata": {},
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "We create the right-hand side of the NODE by combining the NN with the force f_cd"
+   ],
+   "metadata": {}
+  },
+  {
+   "outputs": [],
+   "cell_type": "code",
+   "source": [
+    "f_NODE = create_NODE_cd(NN, p; is_closed = true)"
+   ],
+   "metadata": {},
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "and get the parameters that you want to train"
+   ],
+   "metadata": {}
+  },
+  {
+   "outputs": [],
+   "cell_type": "code",
+   "source": [
+    "θ, st = Lux.setup(rng, f_NODE)"
+   ],
+   "metadata": {},
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "---\n",
+    "\n",
+    "*This notebook was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*"
+   ],
+   "metadata": {}
+  }
+ ],
+ "nbformat_minor": 3,
+ "metadata": {
+  "language_info": {
+   "file_extension": ".jl",
+   "mimetype": "application/julia",
+   "name": "julia",
+   "version": "1.10.2"
+  },
+  "kernelspec": {
+   "name": "julia-1.10",
+   "display_name": "Julia 1.10.2",
+   "language": "julia"
+  }
+ },
+ "nbformat": 4
+}
diff --git a/examples/TODO_03.01-Advecion.md b/examples/TODO_03.01-Advecion.md
new file mode 100644
index 0000000..f5ead17
--- /dev/null
+++ b/examples/TODO_03.01-Advecion.md
@@ -0,0 +1,142 @@
+```julia
+using Lux
+using NaNMath
+using SciMLSensitivity
+using DiffEqFlux
+using DifferentialEquations
+using Plots
+using Zygote
+using Random
+rng = Random.seed!(1234)
+using OptimizationOptimisers
+using Statistics
+using ComponentArrays
+using CUDA
+ArrayType = CUDA.functional() ? CuArray : Array
+```
+
+Import our custom backend functions
+
+```julia
+include("coupling_functions/functions_example.jl")
+include("coupling_functions/functions_NODE.jl")
+include("coupling_functions/functions_loss.jl")
+```
+
+We want to solve the convection-diffusion equation. This is the equation we want to solve:
+
+$$\frac{\partial c}{\partial t} + u_x \frac{\partial c}{\partial x} + u_y \frac{\partial c}{\partial y} = \nu_x \frac{\partial^2 c}{\partial x^2} + \nu_y \frac{\partial^2 c}{\partial y^2}$$
+
+Discretizing it in space turns it into a system of first-order ODEs, one per grid point.
+
+Create the grid
+
+```julia
+x = collect(LinRange(-pi, pi, 101))
+y = collect(LinRange(-pi, pi, 101))
+```
+
+so we get these dx and dy (constant grid)
+
+```julia
+dx = x[2] - x[1]
+dy = y[2] - y[1]
+```
+
+and the initial condition is random
+
+```julia
+c0 = rand(101, 101)
+```
+
+which is a scalar field, because we are looking for the concentration of a single species.
+
+The user specifies this equation:
+
+```julia
+function gen_conv_diff_f(speed, viscosity, dx, dy)
+```
+
+Derivatives using finite differences (periodic boundaries)
+
+```julia
+    function first_derivative(u, Δx, Δy)
+        du_dx = zeros(size(u))
+        du_dy = zeros(size(u))
+
+        du_dx[:, 2:(end - 1)] = (u[:, 3:end] - u[:, 1:(end - 2)]) / (2 * Δx)
+        du_dx[:, 1] = (u[:, 2] - u[:, end]) / (2 * Δx)
+        du_dx[:, end] = (u[:, 1] - u[:, end - 1]) / (2 * Δx)
+
+        du_dy[2:(end - 1), :] = (u[3:end, :] - u[1:(end - 2), :]) / (2 * Δy)
+        du_dy[1, :] = (u[2, :] - u[end, :]) / (2 * Δy)
+        du_dy[end, :] = (u[1, :] - u[end - 1, :]) / (2 * Δy)
+
+        return du_dx, du_dy
+    end
+    function second_derivative(u, Δx, Δy)
+        d2u_dx2 = zeros(size(u))
+        d2u_dy2 = zeros(size(u))
+
+        d2u_dx2[:, 2:(end - 1)] = (u[:, 3:end] - 2 * u[:, 2:(end - 1)] +
+                                   u[:, 1:(end - 2)]) / (Δx^2)
+        d2u_dx2[:, 1] = (u[:, 2] - 2 * u[:, 1] + u[:, end]) / (Δx^2)
+        d2u_dx2[:, end] = (u[:, 1] - 2 * u[:, end] + u[:, end - 1]) / (Δx^2)
+
+        d2u_dy2[2:(end - 1), :] = (u[3:end, :] - 2 * u[2:(end - 1), :] +
+                                   u[1:(end - 2), :]) / (Δy^2)
+        d2u_dy2[1, :] = (u[2, :] - 2 * u[1, :] + u[end, :]) / (Δy^2)
+        d2u_dy2[end, :] = (u[1, :] - 2 * u[end, :] + u[end - 1, :]) / (Δy^2)
+
+        return d2u_dx2, d2u_dy2
+    end
+```
+
+Convection-diffusion equation
+
+```julia
+    function f_cd(u,
+            t,
+            ddx_ddy = first_derivative,
+            d2dx2_d2dy2 = second_derivative,
+            viscosity = viscosity,
+            speed = speed,
+            dx = dx,
+            dy = dy)
+        du_dx, du_dy = ddx_ddy(u, dx, dy)
+        d2u_dx2, d2u_dy2 = d2dx2_d2dy2(u, dx, dy)
+        return -speed[1] * du_dx - speed[2] * du_dy .+ viscosity[1] * d2u_dx2 .+
+               viscosity[2] * d2u_dy2
+    end
+    return f_cd
+end
+```
+
+So this is the force:
+
+```julia
+# gen_conv_diff_f returns the force function f_cd(u, t)
+f_cd = gen_conv_diff_f([0.1, 0.1], [0.00001, 0.00001], dx, dy)
+```
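+
+As a quick sanity check (a sketch added here, not part of the original script), the force alone can be integrated with an ODE solver before any neural closure is attached:
+
+```julia
+# Integrate the plain convection-diffusion force from the random initial condition
+prob_cd = ODEProblem((c, p, t) -> f_cd(c, t), c0, (0.0, 1.0))
+sol_cd = solve(prob_cd, Tsit5(); saveat = 0.2)
+heatmap(sol_cd.u[end], aspect_ratio = 1, title = "c at t = 1")
+```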
+
+create the NN
+
+```julia
+NN = create_nn_cd()
+```
+
+We create the right-hand side of the NODE by combining the NN with the force f_cd
+
+```julia
+f_NODE = create_NODE_cd(NN, p; is_closed = true)
+```
+
+and get the parameters that you want to train
+
+```julia
+θ, st = Lux.setup(rng, f_NODE)
+```
+
+---
+
+*This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).*