From a7cda29690c95edeab0df1b46044985521a83840 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Fri, 24 Sep 2021 20:59:00 +0100 Subject: [PATCH 001/324] Disable user option in makefile --- doc/Makefile | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/doc/Makefile b/doc/Makefile index 3a6807783a..7e68d6ad2c 100755 --- a/doc/Makefile +++ b/doc/Makefile @@ -8,28 +8,14 @@ .PHONY : all user doxygen clean distclean -all : user doxygen - -user : - pdflatex --shell-escape user.tex - pdflatex --shell-escape user.tex - -bibtex user - pdflatex --shell-escape user.tex - latex_count=8 ; \ - while egrep -s 'Rerun (LaTeX|to get cross-references right)' user.log && [ $$latex_count -gt 0 ] ;\ - do \ - echo "Rerunning latex...." ;\ - pdflatex --shell-escape user.tex ;\ - latex_count=`expr $$latex_count - 1` ;\ - done - +all : doxygen doxygen : doxygen ops/Doxyfile cd ops/latex; make refman.pdf doxygen ops_translator/Doxyfile cd ops_translator/latex; make refman.pdf -clean : +clean : -rm -f *.out *.aux *.blg *.pyg.* *.log *.backup *.toc *~ *.bbl -rm -rf _minted-user From a9e93a497bcaa5e2d99759c78ecce6476d10ed7b Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Fri, 24 Sep 2021 21:25:50 +0100 Subject: [PATCH 002/324] Create documentation framework --- .readthedocs.yml | 13 + doc/conf.py | 60 ++++ doc/index.rst | 24 ++ doc/installation.md | 94 +++++ doc/introduction.md | 50 +++ doc/keyconcept.md | 102 ++++++ doc/quickstart.md | 3 + doc/requirement.txt | 2 + doc/user.md | 839 ++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 1187 insertions(+) create mode 100644 .readthedocs.yml create mode 100644 doc/conf.py create mode 100644 doc/index.rst create mode 100644 doc/installation.md create mode 100644 doc/introduction.md create mode 100644 doc/keyconcept.md create mode 100644 doc/quickstart.md create mode 100644 doc/requirement.txt create mode 100644 doc/user.md diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 
0000000000..ceca0e737d --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,13 @@ +# File: .readthedocs.yaml + +version: 2 + +# Build from the docs/ directory with Sphinx +sphinx: + configuration: doc/conf.py + +# Explicitly set the version of Python and its requirements +python: + version: 3.8 + install: + - requirements: doc/requirement.txt \ No newline at end of file diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000000..8be05822d8 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,60 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'Oxford Parallel library for Structured mesh solvers' +copyright = 'Copyright (c) 2013, Mike Giles and others' +author = "Mike Giles, Istvan Reguly, Gihan Mudalige" + +# The full version, including alpha/beta/rc tags +release = 'latest' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.mathjax', + 'sphinx.ext.ifconfig', + 'myst_parser' +] + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ['_templates'] + + +source_suffix = ['.rst', '.md'] +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" #'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000000..d4f72096a9 --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,24 @@ +.. Test documentation master file, created by + sphinx-quickstart on Thu Sep 23 09:45:16 2021. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Test's documentation! +================================ + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + introduction.md + keyconcept.md + installation.md + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/doc/installation.md b/doc/installation.md new file mode 100644 index 0000000000..c46de11093 --- /dev/null +++ b/doc/installation.md @@ -0,0 +1,94 @@ +# Installation + +**Note: The current CMakefile and relevant instructions are mainly tested on linux-based systems including Windows Subsystem for Linux** + +## Dependencies + + * CMake + + CMake 3.18 or newer is required for using the CMake building system. 
If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. + ```bash + version=3.19.0 + wget https://github.com/Kitware/CMake/releases/download/v$version/cmake-$version-Linux-x86_64.sh + # Assume that CMake is going to be installed at /usr/local/cmake + cmake_dir=/usr/local/cmake + # sudo is not necessary for directories in user space. + sudo mkdir $cmake_dir + sudo sh ./cmake-$version-Linux-x86_64.sh --prefix=$cmake_dir --skip-license + sudo ln -s $cmake_dir/bin/cmake /usr/local/bin/cmake + ``` + + * Python2 + + **Python2** is required by the OPS Python translator. The CMake build system will try to identify it automatically. However, the process can fail sometimes (e.g., if there are both Python2 and Python3 installed). If this happens, the path to Python2 can be specified manually by using **-DPython2_EXECUTABLE** when invoking CMake. + + * HDF5 + + [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identifies the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using -DHDF5_ROOT. + + * CUDA + + The CMake build system will detect the toolkit automatically. If the automatic process fails, please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path; otherwise the build system will compile the library without CUDA support. 
+ + + + +## Build OPS back-end libraries example applications +### Build the library and example applications together + + Create a build directory, and run CMake (version 3.18 or newer) + ```bash + mkdir build + cd build + # Please see below for CMake options + cmake ${PATH_TO_OPS} -DBUILD_OPS_APPS=ON -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL -DGPU_NUMBER=1 + make # IEEE=1 this option is important for applications to get accurate results + make install # sudo is needed if a directory like /usr/local/ is chosen. + ``` +After installation, the library and the python translator can be found at the direcory specified by CMAKE_INSTALL_PREFIX, together with the executable files for applications at APP_INSTALL_DIR. + +### Build the library and example applications separately + +In this mode, the library can be firstly built and installed as + +```bash + mkdir build + cd build + # Please see below for CMake options + cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL + make # IEEE=1 this option is important for applications to get accurate results + make install # sudo is needed if a system direction is chosen, + ``` +then the application can be built as + +```bash + mkdir appbuild + cd appbuild + # Please see below for CMake options + cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 + make # IEEE=1 this option is important for applications to get accurate results + ``` +### Tests + +A few tasks for testing codes can be run by +```bash + make test + ``` +The current tests are mainly based on the applications. 
+### Options of interest to specify to `cmake` include: + + * `-DCMAKE_BUILD_TYPE=Release` - enable optimizations + * `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only) + * `-DOPS_TEST=ON` - enable the tests + * `-DCMAKE_INSTALL_PREFIX=` - specify the installation directory for the library (/usr/local by default, Library CMake only) + * `-DAPP_INSTALL_DIR=` - specify the installation directory for the applications ($HOME/OPS-APPS by default) + * `-DGPU_NUMBER=` - specify the number of GPUs used in the tests + * `-DOPS_INSTALL_DIR=` - specify where the OPS library is installed (Application CMake only, see [here](#build-the-library-and-example-applications-separately)) + * `-DOPS_VERBOSE_WARNING=ON` - show verbose output during the build process + + + diff --git a/doc/introduction.md b/doc/introduction.md new file mode 100644 index 0000000000..515fcee001 --- /dev/null +++ b/doc/introduction.md @@ -0,0 +1,50 @@ +# Introduction + +OPS is a high-level framework with associated libraries and +preprocessors to generate parallel executables for applications on +**multi-block structured grids**. Multi-block structured grids consist +of an unstructured collection of structured meshes/grids. This document +describes the OPS C++ API, which supports the development of +single-block and multi-block structured meshes. + +Much of the API and library follows the structure of the OP2 high-level +library for unstructured mesh applications [@op2]. However the +structured mesh domain is distinct from the unstructured mesh +applications domain due to the implicit connectivity between +neighbouring mesh elements (such as vertices, cells) in structured +meshes/grids. The key idea is that operations involve looping over a +"rectangular" multi-dimensional set of grid points using one or more +"stencils" to access data. In multi-block grids, we have several +structured blocks. 
The connectivity between the faces of different +blocks can be quite complex, and in particular they may not be oriented +in the same way, i.e. an $i,j$ face of one block may correspond to the +$j,k$ face of another block. This is awkward and hard to handle simply. + +To clarify some of the important issues in designing the API, we note +here some needs connected with a 3D application: + +- When looping over the interior with loop indices $i,j,k$, often + there are 1D arrays which are referenced using just one of the + indices. + +- To implement boundary conditions, we often loop over a 2D face, + accessing both the 3D dataset and data from a 2D dataset. + +- To implement periodic boundary conditions using dummy "halo" points, + we sometimes have to copy one plane of boundary data to another. + e.g. if the first dimension has size $I$ then we might copy the + plane $i=I\!-\!2$ to plane $i=0$, and plane $i=1$ to plane + $i=I\!-\!1$. + +- In multigrid, we are working with two grids with one having twice as + many points as the other in each direction. To handle this we + require a stencil with a non-unit stride. + +- In multi-block grids, we have several structured blocks. The + connectivity between the faces of different blocks can be quite + complex, and in particular they may not be oriented in the same way, + i.e. an $i,j$ face of one block may correspond to the $j,k$ face of + another block. This is awkward and hard to handle simply. + +The latest proposal is to handle all of these different requirements +through stencil definitions. \ No newline at end of file diff --git a/doc/keyconcept.md b/doc/keyconcept.md new file mode 100644 index 0000000000..bd26fc25b5 --- /dev/null +++ b/doc/keyconcept.md @@ -0,0 +1,102 @@ +# Key concepts and structure + +An OPS application can generally be divided into two key parts: +initialisation and parallel execution. 
During the initialisation phase, +one or more blocks (ops_block) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a +block, and have a specific size (in each dimension of the block), which +may be slightly different across different datasets (e.g. staggered +grids), in some directions they may be degenerate (a size of 1), or they +can represent data associated with different multigrid levels (where +their size if a multiple or a fraction of other datasets). Datasets can +be declared with empty (NULL) pointers, then OPS will allocate the +appropriate amount of memory, may be passed non-NULL pointers (currently +only supported in non-MPI environments), in which case OPS will assume +the memory is large enough for the data and the block halo, and there +are HDF5 dataset declaration routines which allow the distributed +reading of datasets from HDF5 files. The concept of blocks is necessary +to group datasets together, as in a multi-block problem, in a +distributed memory environment, OPS needs to be able to determine how to +decompose the problem. + +The initialisation phase usually also consists of defining the stencils +to be used later on (though they can be defined later as well), which +describe the data access patterns used in parallel loops. Stencils are +always relative to the "current" point; e.g. if at iteration $(i,j)$, we +wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two +points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in +one of the dimensions the dataset's size is 1), as well as for +multigrid, there are special strided, restriction, and prolongation +stencils: they differ from normal stencils in that as one steps through +a grid in a parallel loop, the stepping is done with a non-unit stride +for these datasets. 
For example, in a 2D problem, if we have a +degenerate dataset called xcoords, size $(N,1)$, then we will need a +stencil with stride $(1,0)$ to access it in a regular 2D loop. + +Finally, the initialisation phase may declare a number of global +constants - these are variables in global scope that can be accessed +from within user kernels, without having to pass them in explicitly. +These may be scalars or small arrays, generally for values that do not +change during execution, though they may be updated during execution +with repeated calls to `ops_decl_const`. + +The initialisation phase is terminated by a call to `ops_partition`. + +The bulk of the application consists of parallel loops, implemented +using calls to `ops_par_loop`. These constructs work with datasets, +passed through the opaque `ops_dat` handles declared during the +initialisation phase. The iterations of parallel loops are semantically +independent, and it is the responsibility of the user to enforce this: +the order in which iterations are executed cannot affect the result +(within the limits of floating point precision). Parallel loops are +defined on a block, with a prescribed iteration range that is always +defined from the perspective of the dataset written/modified (the sizes +of datasets, particularly in multigrid situations, may be very +different). Datasets are passed in using `ops_arg_dat`, and during +execution, values at the current grid point will be passed to the user +kernel. These values are passed wrapped in a templated `ACC<>` object +(templated on the type of the data), whose parentheses operator is +overloaded, which the user must use to specify the relative offset to +access the grid point's neighbours (which accesses have to match the the +declared stencil). Datasets written may only be accessed with a +one-point, zero-offset stencil (otherwise the parallel semantics may be +violated). 
+ +Other than datasets, one can pass in read-only scalars or small arrays +that are iteration space invariant with `ops_arg_gbl` (typically +weights, $\delta t$, etc. which may be different in different loops). +The current iteration index can also be passed in with `ops_arg_idx`, +which will pass a globally consistent index to the user kernel (i.e. +also under MPI). + +Reductions in loops are done using the ops_arg_reduce argument, which +takes a reduction handle as an argument. The result of the reduction can +then be acquired using a separate call to `ops_reduction_result`. The +semantics are the following: a reduction handle after it was declared is +in an "uninitialised" state. The first time it is used as an argument to +a loop, its type is determined (increment/min/max), and is initialised +appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in +parallel loops are combined together, up until the point, where the +result is acquired using `ops_reduction_result`, which then sets it back +to an uninitialised state. This also implies, that different parallel +loops, which all use the same reduction handle, but are otherwise +independent, are independent and their partial reduction results can be +combined together associatively and commutatively. + +OPS takes responsibility for all data, its movement and the execution of +parallel loops. With different execution hardware and optimisations, +this means OPS will re-organise data as well as execution (potentially +across different loops), and therefore any data accesses or manipulation +may only be done through the OPS API. + +This restriction is exploited by a lazy execution mechanism in OPS. The +idea is that OPS API calls that do not return a result can be not +executed immediately, rather queued, and once an API call requires +returning some data, operations in the queue are executed, and the +result is returned. This allows OPS to analyse and optimise operations +in the queue together. 
This mechanism is fully automated by OPS, and is +used with the various \_tiled executables. For more information on how +to use this mechanism for improving CPU performance, see Section +[\[sec:tiling\]](#sec:tiling){reference-type="ref" +reference="sec:tiling"}. Some API calls triggering the execution of +queued operations include ops_reduction_result, and the functions in the +data access API. \ No newline at end of file diff --git a/doc/quickstart.md b/doc/quickstart.md new file mode 100644 index 0000000000..93813d43d9 --- /dev/null +++ b/doc/quickstart.md @@ -0,0 +1,3 @@ +# Quick start +## How to use math +$$\alpha$$ \ No newline at end of file diff --git a/doc/requirement.txt b/doc/requirement.txt new file mode 100644 index 0000000000..9af8e80a61 --- /dev/null +++ b/doc/requirement.txt @@ -0,0 +1,2 @@ + # We set the tools needed by sphinx + myst-parser == 0.15.2 diff --git a/doc/user.md b/doc/user.md new file mode 100644 index 0000000000..d3ebca8478 --- /dev/null +++ b/doc/user.md @@ -0,0 +1,839 @@ +--- +author: +- Mike Giles, Istvan Reguly, Gihan Mudalige +date: May 2019 +title: OPS C++ User's Manual +--- + + + + + +# OPS C++ API + +## Initialisation declaration and termination routines + +### {#section .unnumbered} + +::: list +plus 1pt minus 1pt + +the usual command line arguments + +an integer which defines the level of debugging diagnostics and +reporting to be performed +::: + +Currently, higher `diags_level`s does the following checks\ +`diags_level` $=$ 1 : no diagnostics, default to achieve best runtime +performance.\ +`diags_level` $>$ 1 : print block decomposition and `ops_par_loop` +timing breakdown.\ +`diags_level` $>$ 4 : print intra-block halo buffer allocation feedback +(for OPS internal development only)\ +`diags_level` $>$ 5 : check if intra-block halo MPI sends depth match +MPI receives depth (for OPS internal development only)\ + +### {#section-1 .unnumbered} + +::: list +plus 1pt minus 1pt + +dimension of the block + +a name used for 
output diagnostics +::: + +### {#section-2 .unnumbered} + +::: list +plus 1pt minus 1pt + +dimension of the block + +a name used for output diagnostics + +hdf5 file to read and obtain the block information from +::: + +Although this routine does not read in any extra information about the +block from the named HDF5 file than what is already specified in the +arguments, it is included here for error checking (e.g. check if blocks +defined in an HDF5 file is matching with the declared arguments in an +application) and completeness.\ + +### {#section-3 .unnumbered} + +::: list +plus 1pt minus 1pt + +structured block + +dimension of dataset (number of items per grid element) + +size in each dimension of the block + +base indices in each dimension of the block + +padding from the face in the negative direction for each dimension (used +for block halo) + +padding from the face in the positive direction for each dimension (used +for block halo) + +input data of type `T` + +the name of type used for output diagnostics (e.g. "double", "float") + +a name used for output diagnostics +::: + +The `size` allows to declare different sized data arrays on a given +`block`. `d_m` and `d_p` are depth of the "block halos" that are used to +indicate the offset from the edge of a block (in both the negative and +positive directions of each dimension).\ +\ + +### {#section-4 .unnumbered} + +::: list +plus 1pt minus 1pt + +structured block + +dimension of dataset (number of items per grid element) + +the name of type used for output diagnostics (e.g. "double", "float") + +name of the dat used for output diagnostics + +hdf5 file to read and obtain the data from +::: + +### {#section-5 .unnumbered} + +::: list +plus 1pt minus 1pt + +a name used to identify the constant + +dimension of dataset (number of items per element) + +the name of type used for output diagnostics (e.g. 
"double", "float") + +pointer to input data of type `T` +::: + +### {#section-6 .unnumbered} + +::: list +plus 1pt minus 1pt + +a name used to identify the constant + +dimension of dataset (number of items per element) + +the name of type used for output diagnostics (e.g. "double", "float") + +pointer to new values for constant of type `T` +::: + +### {#section-7 .unnumbered} + +::: list +plus 1pt minus 1pt + +origin dataset + +destination dataset + +defines an iteration size (number of indices to iterate over in each +direction) + +indices of starting point in \"from\" dataset + +indices of starting point in \"to\" dataset + +direction of incrementing for \"from\" for each dimension of `iter_size` + +direction of incrementing for \"to\" for each dimension of `iter_size` +::: + +A from_dir \[1,2\] and a to_dir \[2,1\] means that x in the first block +goes to y in the second block, and y in first block goes to x in second +block. A negative sign indicates that the axis is flipped. (Simple +example: a transfer from (1:2,0:99,0:99) to (-1:0,0:99,0:99) would use +iter_size = \[2,100,100\], from_base = \[1,0,0\], to_base = \[-1,0,0\], +from_dir = \[0,1,2\], to_dir = \[0,1,2\]. In more complex case this +allows for transfers between blocks with different orientations.)\ + +### {#section-8 .unnumbered} + +::: list +plus 1pt minus 1pt + +origin dataset + +destination dataset + +hdf5 file to read and obtain the data from +::: + +### {#section-9 .unnumbered} + +::: list +plus 1pt minus 1pt + +number of halos in `halos` + +array of halos +::: + +### {#section-10 .unnumbered} + +::: list +plus 1pt minus 1pt + +size of data in bytes + +the name of type used for output diagnostics (e.g. 
"double", "float") + +name of the dat used for output diagnostics +::: + +::: list +plus 1pt minus 1pt + +the `ops_reduction` handle + +a pointer to write the results to, memory size has to match the declared +::: + +### {#section-11 .unnumbered} + +::: list +plus 1pt minus 1pt + +string describing the partitioning method. Currently this string is not +used internally, but is simply a place-holder to indicate different +partitioning methods in the future. +::: + +### {#section-12 .unnumbered} + +::: list +plus 1pt minus 1pt +::: + +## Diagnostics and output routines + +### {#section-13 .unnumbered} + +::: list +plus 1pt minus 1pt +::: + +### {#section-14 .unnumbered} + +::: list +plus 1pt minus 1pt +::: + +### {#section-15 .unnumbered} + +::: list +plus 1pt minus 1pt + +variable to hold the CPU time at the time of invocation + +variable to hold the elapsed time at the time of invocation +::: + +### {#section-16 .unnumbered} + +::: list +plus 1pt minus 1pt + +ops_block to be written + +hdf5 file to write to +::: + +### {#section-17 .unnumbered} + +::: list +plus 1pt minus 1pt + +ops_stencil to be written + +hdf5 file to write to +::: + +### {#section-18 .unnumbered} + +::: list +plus 1pt minus 1pt + +ops_dat to be written + +hdf5 file to write to +::: + +### {#section-19 .unnumbered} + +::: list +plus 1pt minus 1pt + +ops_dat to to be written + +text file to write to +::: + +### {#section-20 .unnumbered} + +::: list +plus 1pt minus 1pt + +output stream, use stdout to print to standard out +::: + +### {#section-21 .unnumbered} + +::: list +plus 1pt minus 1pt + +ops_dat to to be checked +::: + +## Halo exchange + +### {#section-22 .unnumbered} + +::: list +plus 1pt minus 1pt + +the halo group +::: + +## Parallel loop syntax + +A parallel loop with N arguments has the following syntax: + +### {#section-23 .unnumbered} + +::: list +plus 1pt minus 1pt + +user's kernel function with N arguments + +name of kernel function, used for output diagnostics + +the ops_block over 
which this loop executes + +dimension of loop iteration + +iteration range array + +arguments +::: + +The **ops_arg** arguments in **ops_par_loop** are provided by one of the +following routines, one for global constants and reductions, and the +other for OPS datasets. + +### {#section-24 .unnumbered} + +::: list +plus 1pt minus 1pt + +data array + +array dimension + +string representing the type of data held in data + +access type +::: + +### {#section-25 .unnumbered} + +::: list +plus 1pt minus 1pt + +an `ops_reduction` handle + +array dimension (according to `type`) + +string representing the type of data held in data + +access type +::: + +### {#section-26 .unnumbered} + +::: list +plus 1pt minus 1pt + +dataset + +stencil for accessing data + +string representing the type of data held in dataset + +access type +::: + +### {#section-27 .unnumbered} + +::: list +plus 1pt minus 1pt +::: + +## Stencils + +The final ingredient is the stencil specification, for which we have two +versions: simple and strided.\ + +### {#section-28 .unnumbered} + +::: list +plus 1pt minus 1pt + +dimension of loop iteration + +number of points in the stencil + +stencil for accessing data + +string representing the name of the stencil +::: + +### {#section-29 .unnumbered} + +::: list +plus 1pt minus 1pt + +dimension of loop iteration + +number of points in the stencil + +stencil for accessing data + +stride for accessing data + +string representing the name of the stencil\ +::: + +### {#section-30 .unnumbered} + +::: list +plus 1pt minus 1pt + +dimension of loop iteration + +number of points in the stencil + +string representing the name of the stencil + +hdf5 file to write to +::: + +In the strided case, the semantics for the index of data to be accessed, +for stencil point `p`, in dimension `m` are defined as:\ +,\ +where `loop_index[m]` is the iteration index (within the user-defined +iteration space) in the different dimensions. 
+ +If, for one or more dimensions, both `stride[m]` and `stencil[p*dims+m]` +are zero, then one of the following must be true; + +- the dataset being referenced has size 1 for these dimensions + +- these dimensions are to be omitted and so the dataset has dimension + equal to the number of remaining dimensions. + +See `OPS/apps/c/CloverLeaf/build_field.cpp` and +`OPS/apps/c/CloverLeaf/generate.cpp` for an example +`ops_decl_strided_stencil` declaration and its use in a loop, +respectively.\ +These two stencil definitions probably take care of all of the cases in +the Introduction except for multiblock applications with interfaces with +different orientations -- this will need a third, even more general, +stencil specification. The strided stencil will handle both multigrid +(with a stride of 2 for example) and the boundary condition and reduced +dimension applications (with a stride of 0 for the relevant dimensions). + +## Checkpointing + +OPS supports the automatic checkpointing of applications. Using the API +below, the user specifies the file name for the checkpoint and an +average time interval between checkpoints, OPS will then automatically +save all necessary information periodically that is required to +fast-forward to the last checkpoint if a crash occurred. Currently, when +re-launching after a crash, the same number of MPI processes have to be +used. To enable checkpointing mode, the `OPS_CHECKPOINT` runtime +argument has to be used.\ + +### {#section-31 .unnumbered} + +::: list +plus 1pt minus 1pt + +name of the file for checkpointing. In MPI, this will automatically be +post-fixed with the rank ID. 
+ +average time (seconds) between checkpoints + +a combinations of flags, listed in `ops_checkpointing.h`:\ +OPS_CHECKPOINT_INITPHASE - indicates that there are a number of parallel +loops at the very beginning of the simulations which should be excluded +from any checkpoint; mainly because they initialise datasets that do not +change during the main body of the execution. During restore mode these +loops are executed as usual. An example would be the computation of the +mesh geometry, which can be excluded from the checkpoint if it is +re-computed when recovering and restoring a checkpoint. The API call +void `ops_checkpointing_initphase_done()` indicates the end of this +initial phase. + +OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually +controls the location of the checkpoint, and explicitly specifies the +list of `ops_dat`s to be saved. + +OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the +location of the checkpoint, and it also enables fast-forwarding, by +skipping the execution of the application (even though none of the +parallel loops would actually execute, there may be significant work +outside of those) up to the checkpoint. + +OPS_CHECKPOINT_MANUAL - Indicates that when the corresponding API +function is called, the checkpoint should be created. Assumes the +presence of the above two options as well. 
+::: + +### {#section-32 .unnumbered} + +::: list +plus 1pt minus 1pt + +number of datasets to be saved + +arrays of `ops_dat` handles to be saved +::: + +### {#section-33 .unnumbered} + +::: list +plus 1pt minus 1pt + +size of the payload in bytes + +pointer to memory into which the payload is packed +::: + +### {#section-34 .unnumbered} + +::: list +plus 1pt minus 1pt + +number of datasets to be saved + +arrays of `ops_dat` handles to be saved + +size of the payload in bytes + +pointer to memory into which the payload is packed +::: + +### {#section-35 .unnumbered} + +::: list +plus 1pt minus 1pt + +number of datasets to be saved + +arrays of `ops_dat` handles to be saved + +size of the payload in bytes + +pointer to memory into which the payload is packed +::: + +The suggested use of these **manual** functions is of course when the +optimal location for checkpointing is known - one of the ways to +determine that is to use the built-in algorithm. More details of this +will be reported in a tech-report on checkpointing, to be published +later. + +## Access to OPS data + +This section describes APIS that give the user access to internal data +structures in OPS and return data to user-space. 
These should be used +cautiously and sparsely, as they can affect performance significantly + +### {#section-36 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset +::: + +### {#section-37 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset +::: + +### {#section-38 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +an array populated with the displacement of the chunk within the +"global" distributed array + +an array populated with the spatial extents +::: + +### {#section-39 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +an array populated with the displacement of the chunk within the +"global" distributed array + +an array populated with the spatial extents + +an array populated strides in spatial dimensions needed for column-major +indexing + +an array populated with padding on the left in each dimension. Note that +these are negative values + +an array populated with padding on the right in each dimension +::: + +### {#section-40 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +a stencil used to determine required MPI halo exchange depths + +when set to OPS_HOST or OPS_DEVICE, returns a pointer to data in that +memory space, otherwise must be set to 0, and returns whether data is in +the host or on the device +::: + +### {#section-41 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +the kind of access that was used by the user (OPS_READ if it was read +only, OPS_WRITE if it was overwritten, OPS_RW if it was read and +written) +::: + +### {#section-42 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +the kind of access that was used by the user (OPS_READ if it was read +only, OPS_WRITE if it was overwritten, OPS_RW if it was read and +written) + +set to OPS_HOST or OPS_DEVICE +::: + +### {#section-43 .unnumbered} + 
+
+::: list
+plus 1pt minus 1pt
+
+the dataset
+
+the chunk index (has to be 0)
+
+pointer to memory which should be filled by OPS
+:::
+
+### {#section-44 .unnumbered}
+
+::: list
+plus 1pt minus 1pt
+
+the dataset
+
+the chunk index (has to be 0)
+
+pointer to memory which should be copied to OPS
+:::
+
+# Tiling for Cache-blocking
+
+OPS has a code generation (ops_gen_mpi_lazy) and build target for
+tiling. Once compiled, to enable, use the `OPS_TILING` runtime parameter
+- this will look at the L3 cache size of your CPU and guess the correct
+tile size. If you want to alter the amount of cache to be used for the
+guess, use the `OPS_CACHE_SIZE=XX` runtime parameter, where the value is
+in Megabytes. To manually specify the tile sizes, use the
+OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments.
+
+When MPI is combined with OpenMP tiling can be extended to the MPI
+halos. Set `OPS_TILING_MAXDEPTH` to increase the halo depths so that
+halos for multiple `ops_par_loops` can be exchanged with a single MPI
+message (see [@TPDS2017] for more details)\
+To test, compile CloverLeaf under `apps/c/CloverLeaf`, modify clover.in
+to use a $6144^2$ mesh, then run as follows:\
+For OpenMP with tiling:\
+`export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING`\
+For MPI+OpenMP with tiling:\
+`export OMP_NUM_THREADS=xx; mpirun -np xx ./cloverleaf_mpi_tiled OPS_TILING OPS_TILING_MAXDEPTH=6`\
+To manually specify the tile sizes (in number of grid points), use the
+OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments:\
+`export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING OPS_TILESIZE_X=600 OPS_TILESIZE_Y=200 `

+# CUDA and OpenCL Runtime Arguments
+
+The CUDA (and OpenCL) thread block sizes can be controlled by setting
+the `OPS_BLOCK_SIZE_X, OPS_BLOCK_SIZE_Y` and `OPS_BLOCK_SIZE_Z` runtime
+arguments. 
For example :\ +`./cloverleaf_cuda OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4`\ +`OPS_CL_DEVICE=XX` runtime flag sets the OpenCL device to execute the +code on.\ +Usually `OPS_CL_DEVICE=0` selects the CPU and `OPS_CL_DEVICE=1` selects +GPUs. + +# Executing with GPUDirect + +GPU direct support for MPI+CUDA, to enable (on the OPS side) add +**-gpudirect** when running the executable. You may also have to use +certain environmental flags when using different MPI distributions. For +an example of the required flags and environmental settings on the +Cambridge Wilkes2 GPU cluster see:\ + + +# OPS User Kernels + +In OPS, the elemental operation carried out per mesh/grid point is +specified as an outlined function called a *user kernel*. An example +taken from the Cloverleaf application is given in Figure +[\[fig:example\]](#fig:example){reference-type="ref" +reference="fig:example"}.\ + +``` {.cpp mathescape="" linenos="" startFrom="1" numbersep="0pt" gobble="2" frame="lines" framesep="1mm"} +void accelerate_kernel( const ACC &density0, const ACC &volume, + ACC &stepbymass, const ACC &xvel0, ACC &xvel1, + const ACC &xarea, const ACC &pressure, + const ACC &yvel0, ACC &yvel1, + const ACC &yarea, const ACC &viscosity) { + + double nodal_mass; + + //{0,0, -1,0, 0,-1, -1,-1}; + nodal_mass = ( density0(-1,-1) * volume(-1,-1) + + density0(0,-1) * volume(0,-1) + + density0(0,0) * volume(0,0) + + density0(-1,0) * volume(-1,0) ) * 0.25; + + stepbymass(0,0) = 0.5*dt/ nodal_mass; + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, 0,-1}; + + xvel1(0,0) = xvel0(0,0) - stepbymass(0,0) * + ( xarea(0,0) * ( pressure(0,0) - pressure(-1,0) ) + + xarea(0,-1) * ( pressure(0,-1) - pressure(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, -1,0}; + + yvel1(0,0) = yvel0(0,0) - stepbymass(0,0) * + ( yarea(0,0) * ( pressure(0,0) - pressure(0,-1) ) + + yarea(-1,0) * ( pressure(-1,0) - pressure(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, 0,-1}; + + xvel1(0,0) = xvel1(0,0) - stepbymass(0,0) * + ( 
xarea(0,0) * ( viscosity(0,0) - viscosity(-1,0) ) + + xarea(0,-1) * ( viscosity(0,-1) - viscosity(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, -1,0}; + + yvel1(0,0) = yvel1(0,0) - stepbymass(0,0) * + ( yarea(0,0) * ( viscosity(0,0) - viscosity(0,-1) ) + + yarea(-1,0) * ( viscosity(-1,0) - viscosity(-1,-1) ) ); + + +} +``` + +[\[fig:example\]]{#fig:example label="fig:example"} + +\ +\ +\ +\ +This user kernel is then used in an `ops_par_loop` (Figure +[\[fig:parloop\]](#fig:parloop){reference-type="ref" +reference="fig:parloop"}). The key aspect to note in the user kernel in +Figure [\[fig:example\]](#fig:example){reference-type="ref" +reference="fig:example"} is the use of the ACC\<\> objects and their +parentheses operator. These specify the stencil in accessing the +elements of the respective data arrays. + +``` {.cpp mathescape="" linenos="" startFrom="1" numbersep="0pt" gobble="2" frame="lines" framesep="2mm"} + int rangexy_inner_plus1[] = {x_min,x_max+1,y_min,y_max+1}; + + ops_par_loop(accelerate_kernel, "accelerate_kernel", clover_grid, 2, rangexy_inner_plus1, + ops_arg_dat(density0, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(volume, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), + ops_arg_dat(xvel0, 1, S2D_00, "double", OPS_READ), + ops_arg_dat(xvel1, 1, S2D_00, "double", OPS_INC), + ops_arg_dat(xarea, 1, S2D_00_0M1, "double", OPS_READ), + ops_arg_dat(pressure, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(yvel0, 1, S2D_00, "double", OPS_READ), + ops_arg_dat(yvel1, 1, S2D_00, "double", OPS_INC), + ops_arg_dat(yarea, 1, S2D_00_M10, "double", OPS_READ), + ops_arg_dat(viscosity, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ)); +``` + +[\[fig:parloop\]]{#fig:parloop label="fig:parloop"} + +::: thebibliography +1 OP2 for Many-Core Platforms, 2013. + + +Istvan Z. Reguly, G.R. Mudalige, Mike B. Giles. Loop Tiling in +Large-Scale Stencil Codes at Run-time with OPS. 
(2017) IEEE Transactions +on Parallel and Distributed Systems. + +::: From b6e5d503c94d5b28ca31081cb652247fff9ccb43 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:08:29 +0100 Subject: [PATCH 003/324] Update index.rst Update highlevel structure --- doc/index.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/index.rst b/doc/index.rst index d4f72096a9..beef678a88 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -11,8 +11,12 @@ Welcome to Test's documentation! :caption: Contents: introduction.md - keyconcept.md installation.md + devanapp.md + keyconcept.md + opsapi.md + devdoc.md + pubs.md From df79b8f4d00b34b301a57a5ec5d6d2a381824b51 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:10:46 +0100 Subject: [PATCH 004/324] Create devanapp.md new file --- doc/devanapp.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/devanapp.md diff --git a/doc/devanapp.md b/doc/devanapp.md new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/doc/devanapp.md @@ -0,0 +1 @@ + From 5ffa2fe2831ba09b76f1bceba5ca462dc72400b1 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:11:11 +0100 Subject: [PATCH 005/324] Create opsapi.md new file --- doc/opsapi.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/opsapi.md diff --git a/doc/opsapi.md b/doc/opsapi.md new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/doc/opsapi.md @@ -0,0 +1 @@ + From 07555296d1abaf2f0e549d3609ad62bc3dad3eae Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:11:44 +0100 Subject: [PATCH 006/324] Create devdoc.md new file --- doc/devdoc.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/devdoc.md diff --git a/doc/devdoc.md b/doc/devdoc.md new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/doc/devdoc.md @@ -0,0 +1 @@ + From ff425f4900edc0f29324447fb6b36a7def726d5d Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:12:16 +0100 Subject: [PATCH 007/324] Create pubs.md new file --- doc/pubs.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/pubs.md diff --git a/doc/pubs.md b/doc/pubs.md new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/doc/pubs.md @@ -0,0 +1 @@ + From 7df80f55e692e9eb6ff24361158f067cc29dbc3f Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:13:40 +0100 Subject: [PATCH 008/324] Update conf.py update copyright --- doc/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 8be05822d8..4d3b1059f1 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -18,7 +18,7 @@ # -- Project information ----------------------------------------------------- project = 'Oxford Parallel library for Structured mesh solvers' -copyright = 'Copyright (c) 2013, Mike Giles and others' +copyright = 'Copyright (c) 2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others' author = "Mike Giles, Istvan Reguly, Gihan Mudalige" # The full version, including alpha/beta/rc tags @@ -57,4 +57,4 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file +html_static_path = ['_static'] From ab67c78ceb3c2a2a9cce3a3ffa86e9dca960b17b Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:15:57 +0100 Subject: [PATCH 009/324] Update conf.py --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 4d3b1059f1..c53bf5a553 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -18,7 +18,7 @@ # -- Project information ----------------------------------------------------- project = 'Oxford Parallel library for Structured mesh solvers' -copyright = 'Copyright (c) 2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others' +copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others' author = "Mike Giles, Istvan Reguly, Gihan Mudalige" # The full version, including alpha/beta/rc tags From bd96d0b7d7167d5bedf9487b1c6f9e7d5e096323 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:18:56 +0100 Subject: [PATCH 010/324] Update opsapi.md moving key concepts --- doc/opsapi.md | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/doc/opsapi.md b/doc/opsapi.md index 8b13789179..be6f08cd98 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -1 +1,42 @@ +Many of the API and library follows the structure of the OP2 high-level +library for unstructured mesh applications [@op2]. However the +structured mesh domain is distinct from the unstructured mesh +applications domain due to the implicit connectivity between +neighbouring mesh elements (such as vertices, cells) in structured +meshes/grids. The key idea is that operations involve looping over a +"rectangular" multi-dimensional set of grid points using one or more +"stencils" to access data. In multi-block grids, we have several +structured blocks. The connectivity between the faces of different +blocks can be quite complex, and in particular they may not be oriented +in the same way, i.e. an $i,j$ face of one block may correspond to the +$j,k$ face of another block. This is awkward and hard to handle simply. 
+ +To clarify some of the important issues in designing the API, we note +here some needs connected with a 3D application: + +- When looping over the interior with loop indices $i,j,k$, often + there are 1D arrays which are referenced using just one of the + indices. + +- To implement boundary conditions, we often loop over a 2D face, + accessing both the 3D dataset and data from a 2D dataset. + +- To implement periodic boundary conditions using dummy "halo" points, + we sometimes have to copy one plane of boundary data to another. + e.g. if the first dimension has size $I$ then we might copy the + plane $i=I\!-\!2$ to plane $i=0$, and plane $i=1$ to plane + $i=I\!-\!1$. + +- In multigrid, we are working with two grids with one having twice as + many points as the other in each direction. To handle this we + require a stencil with a non-unit stride. + +- In multi-block grids, we have several structured blocks. The + connectivity between the faces of different blocks can be quite + complex, and in particular they may not be oriented in the same way, + i.e. an $i,j$ face of one block may correspond to the $j,k$ face of + another block. This is awkward and hard to handle simply. + +The latest proposal is to handle all of these different requirements +through stencil definitions. From 78767518340d548b685d699f3ca5e8c5cb34951e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:19:42 +0100 Subject: [PATCH 011/324] Update introduction.md Introduction section structure update --- doc/introduction.md | 45 ++++----------------------------------------- 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/doc/introduction.md b/doc/introduction.md index 515fcee001..d7d6a499c3 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -7,44 +7,7 @@ of an unstructured collection of structured meshes/grids. This document describes the OPS C++ API, which supports the development of single-block and multi-block structured meshes. 
-Many of the API and library follows the structure of the OP2 high-level -library for unstructured mesh applications [@op2]. However the -structured mesh domain is distinct from the unstructured mesh -applications domain due to the implicit connectivity between -neighbouring mesh elements (such as vertices, cells) in structured -meshes/grids. The key idea is that operations involve looping over a -"rectangular" multi-dimensional set of grid points using one or more -"stencils" to access data. In multi-block grids, we have several -structured blocks. The connectivity between the faces of different -blocks can be quite complex, and in particular they may not be oriented -in the same way, i.e. an $i,j$ face of one block may correspond to the -$j,k$ face of another block. This is awkward and hard to handle simply. - -To clarify some of the important issues in designing the API, we note -here some needs connected with a 3D application: - -- When looping over the interior with loop indices $i,j,k$, often - there are 1D arrays which are referenced using just one of the - indices. - -- To implement boundary conditions, we often loop over a 2D face, - accessing both the 3D dataset and data from a 2D dataset. - -- To implement periodic boundary conditions using dummy "halo" points, - we sometimes have to copy one plane of boundary data to another. - e.g. if the first dimension has size $I$ then we might copy the - plane $i=I\!-\!2$ to plane $i=0$, and plane $i=1$ to plane - $i=I\!-\!1$. - -- In multigrid, we are working with two grids with one having twice as - many points as the other in each direction. To handle this we - require a stencil with a non-unit stride. - -- In multi-block grids, we have several structured blocks. The - connectivity between the faces of different blocks can be quite - complex, and in particular they may not be oriented in the same way, - i.e. an $i,j$ face of one block may correspond to the $j,k$ face of - another block. 
This is awkward and hard to handle simply. - -The latest proposal is to handle all of these different requirements -through stencil definitions. \ No newline at end of file +## Overview
+## Licensing
+## Citing
+## Support From 388bd5bf429ea8e453c1a6a077627c6ea96d8a83 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:29:12 +0100 Subject: [PATCH 012/324] Update installation.md structure for the installation.md file --- doc/installation.md | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index c46de11093..faa8765b0a 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -36,9 +36,11 @@ * `MPI_HOME` - Installation directory of MPI (to build MPI based distributed memory libs and applications) only needed if MPI not installed in standard locations * `HDF5_ROOT` - Installation directory of HDF5 (to support HDF5 based File I/O) if HDF5 not installed in standard location --> +## Obtaining OPS ## Build OPS back-end libraries example applications -### Build the library and example applications together +### Using `cmake` +#### Build the library and example applications together Create a build directory, and run CMake (version 3.18 or newer) ```bash @@ -51,7 +53,7 @@ ``` After installation, the library and the python translator can be found at the directory specified by CMAKE_INSTALL_PREFIX, together with the executable files for applications at APP_INSTALL_DIR. 
-### Build the library and example applications separately +#### Build the library and example applications separately In this mode, the library can be firstly built and installed as @@ -72,14 +74,14 @@ then the application can be built as cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 make # IEEE=1 this option is important for applications to get accurate results ``` -### Tests +#### Tests A few tasks for testing codes can be run by ```bash make test ``` The current tests are mainly based on the applications. -### Options of interest to specify to `cmake` include: +#### `cmake` options * `-DCMAKE_BUILD_TYPE=Release` - enable optimizations * `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only) @@ -92,3 +94,15 @@ The current tests are mainly based on the applications. +### Using regular `Makefiles` +#### Build library +#### Build application +#### Makefile options + +## Running example applications +### CloverLeaf +### CloverLeaf_3D_HDF5 +### poisson +### adi + +## Runtime flags and options From 24e24d67b69cb2b5f1e0ca990ffa9931e7386f53 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:32:40 +0100 Subject: [PATCH 013/324] Update installation.md --- doc/installation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index faa8765b0a..9ba6bbf668 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -1,4 +1,4 @@ -# Installation +# Getting Started **Note: The current CMakefile and relevant instructions are mainly tested on linux-based systems including Windows Subsystem for Linux** @@ -39,7 +39,7 @@ ## Obtaining OPS ## Build OPS back-end libraries example applications -### Using `cmake` +### Using cmake #### Build the library and example applications together Create a build directory, and run CMake (version 3.18 or newer) @@ -94,7 +94,7 @@ The current tests are mainly based on the applications. -### Using regular `Makefiles` +### Using regular Makefiles #### Build library #### Build application #### Makefile options From 7099e05701cf89bce98ef1df15c28b75dde90c11 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:33:58 +0100 Subject: [PATCH 014/324] Update installation.md --- doc/installation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 9ba6bbf668..a3230a1153 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -38,7 +38,7 @@ ## Obtaining OPS -## Build OPS back-end libraries example applications +## Build OPS back-end libraries and example applications ### Using cmake #### Build the library and example applications together @@ -95,8 +95,8 @@ The current tests are mainly based on the applications. ### Using regular Makefiles -#### Build library -#### Build application +#### Build back-end library +#### Build an application #### Makefile options ## Running example applications From 1d57522fad8155b2e6370366e16fb3b4ecc7d415 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:39:50 +0100 Subject: [PATCH 015/324] Update index.rst --- doc/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/index.rst b/doc/index.rst index beef678a88..0dcc8c4007 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -13,7 +13,6 @@ Welcome to Test's documentation! introduction.md installation.md devanapp.md - keyconcept.md opsapi.md devdoc.md pubs.md From 756b398be57d6cc3dc60a493349111f4e8d68dbd Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:41:20 +0100 Subject: [PATCH 016/324] Update opsapi.md --- doc/opsapi.md | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/doc/opsapi.md b/doc/opsapi.md index be6f08cd98..4a514efa42 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -1,3 +1,106 @@ +# Key concepts and structure + +An OPS application can generally be divided into two key parts: +initialisation and parallel execution. During the initialisation phase, +one or more blocks (ops_block) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a +block, and have a specific size (in each dimension of the block), which +may be slightly different across different datasets (e.g. staggered +grids), in some directions they may be degenerate (a size of 1), or they +can represent data associated with different multigrid levels (where +their size if a multiple or a fraction of other datasets). Datasets can +be declared with empty (NULL) pointers, then OPS will allocate the +appropriate amount of memory, may be passed non-NULL pointers (currently +only supported in non-MPI environments), in which case OPS will assume +the memory is large enough for the data and the block halo, and there +are HDF5 dataset declaration routines which allow the distributed +reading of datasets from HDF5 files. 
The concept of blocks is necessary +to group datasets together, as in a multi-block problem, in a +distributed memory environment, OPS needs to be able to determine how to +decompose the problem. + +The initialisation phase usually also consists of defining the stencils +to be used later on (though they can be defined later as well), which +describe the data access patterns used in parallel loops. Stencils are +always relative to the "current" point; e.g. if at iteration $(i,j)$, we +wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two +points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in +one of the dimensions the dataset's size is 1), as well as for +multigrid, there are special strided, restriction, and prolongation +stencils: they differ from normal stencils in that as one steps through +a grid in a parallel loop, the stepping is done with a non-unit stride +for these datasets. For example, in a 2D problem, if we have a +degenerate dataset called xcoords, size $(N,1)$, then we will need a +stencil with stride $(1,0)$ to access it in a regular 2D loop. + +Finally, the initialisation phase may declare a number of global +constants - these are variables in global scope that can be accessed +from within user kernels, without having to pass them in explicitly. +These may be scalars or small arrays, generally for values that do not +change during execution, though they may be updated during execution +with repeated calls to `ops_decl_const`. + +The initialisation phase is terminated by a call to `ops_partition`. + +The bulk of the application consists of parallel loops, implemented +using calls to `ops_par_loop`. These constructs work with datasets, +passed through the opaque `ops_dat` handles declared during the +initialisation phase. 
The iterations of parallel loops are semantically +independent, and it is the responsibility of the user to enforce this: +the order in which iterations are executed cannot affect the result +(within the limits of floating point precision). Parallel loops are +defined on a block, with a prescribed iteration range that is always +defined from the perspective of the dataset written/modified (the sizes +of datasets, particularly in multigrid situations, may be very +different). Datasets are passed in using `ops_arg_dat`, and during +execution, values at the current grid point will be passed to the user +kernel. These values are passed wrapped in a templated `ACC<>` object +(templated on the type of the data), whose parentheses operator is +overloaded, which the user must use to specify the relative offset to +access the grid point's neighbours (which accesses have to match the the +declared stencil). Datasets written may only be accessed with a +one-point, zero-offset stencil (otherwise the parallel semantics may be +violated). + +Other than datasets, one can pass in read-only scalars or small arrays +that are iteration space invariant with `ops_arg_gbl` (typically +weights, $\delta t$, etc. which may be different in different loops). +The current iteration index can also be passed in with `ops_arg_idx`, +which will pass a globally consistent index to the user kernel (i.e. +also under MPI). + +Reductions in loops are done using the ops_arg_reduce argument, which +takes a reduction handle as an argument. The result of the reduction can +then be acquired using a separate call to `ops_reduction_result`. The +semantics are the following: a reduction handle after it was declared is +in an "uninitialised" state. 
The first time it is used as an argument to +a loop, its type is determined (increment/min/max), and is initialised +appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in +parallel loops are combined together, up until the point, where the +result is acquired using `ops_reduction_result`, which then sets it back +to an uninitialised state. This also implies, that different parallel +loops, which all use the same reduction handle, but are otherwise +independent, are independent and their partial reduction results can be +combined together associatively and commutatively. + +OPS takes responsibility for all data, its movement and the execution of +parallel loops. With different execution hardware and optimisations, +this means OPS will re-organise data as well as execution (potentially +across different loops), and therefore any data accesses or manipulation +may only be done through the OPS API. + +This restriction is exploited by a lazy execution mechanism in OPS. The +idea is that OPS API calls that do not return a result can be not +executed immediately, rather queued, and once an API call requires +returning some data, operations in the queue are executed, and the +result is returned. This allows OPS to analyse and optimise operations +in the queue together. This mechanism is fully automated by OPS, and is +used with the various \_tiled executables. For more information on how +to use this mechanism for improving CPU performance, see Section +[\[sec:tiling\]](#sec:tiling){reference-type="ref" +reference="sec:tiling"}. Some API calls triggering the execution of +queued operations include ops_reduction_result, and the functions in the +data access API. + Many of the API and library follows the structure of the OP2 high-level library for unstructured mesh applications [@op2]. 
However the @@ -40,3 +143,6 @@ here some needs connected with a 3D application: The latest proposal is to handle all of these different requirements through stencil definitions. + +# OPS API - C++ +# OPS API - Fortran From c2dea74308cd1470a0adbb6163a5a4597317484e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:42:04 +0100 Subject: [PATCH 017/324] Update opsapi.md --- doc/opsapi.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 4a514efa42..942d5dee10 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -144,5 +144,5 @@ here some needs connected with a 3D application: The latest proposal is to handle all of these different requirements through stencil definitions. -# OPS API - C++ -# OPS API - Fortran +# OPS API + From 33015060228a9f34892e0028ddf2892219788db6 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:43:16 +0100 Subject: [PATCH 018/324] Update opsapi.md --- doc/opsapi.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 942d5dee10..408bf9e341 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -1,4 +1,6 @@ -# Key concepts and structure +# OPS API + +## Key concepts and structure An OPS application can generally be divided into two key parts: initialisation and parallel execution. During the initialisation phase, @@ -144,5 +146,5 @@ here some needs connected with a 3D application: The latest proposal is to handle all of these different requirements through stencil definitions. -# OPS API +## OPS C++ API From ffebcc32cf37fdc93de67d93640f703868c6ea3e Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:46:25 +0100 Subject: [PATCH 019/324] Update devanapp.md --- doc/devanapp.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 8b13789179..62ce0a66c0 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1 +1,5 @@ - +# Developing an OPS Application
+## Tutorial
+## Supported parallelizations
+## Code-generation flags
+## File I/O From 0ba280e81e5928b2199d5d098467b7f5e16ab53e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:50:00 +0100 Subject: [PATCH 020/324] Update devdoc.md --- doc/devdoc.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index 8b13789179..de190feabd 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -1 +1,11 @@ - +# Developer Guide
+## Code-generator
+### Frontend API parser
+### Target Parallel Templates
+### Elemental Kernel Transformations
+## Back-end library
+### Sequential and multi-threaded CPU
+### MPI and Partitioning
+### HDF5
+### CUDA
+### Cache blocking tiling and comm-avoiding optimizations From b742a73958f8fd88f55cd06462c2da9820daa0bd Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:50:24 +0100 Subject: [PATCH 021/324] Update pubs.md --- doc/pubs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/pubs.md b/doc/pubs.md index 8b13789179..9aca12e2f5 100644 --- a/doc/pubs.md +++ b/doc/pubs.md @@ -1 +1,2 @@ +# Publications From 98f13d18b3c4a34508bd00f23de200632c0c5b10 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:55:52 +0100 Subject: [PATCH 022/324] Update installation.md --- doc/installation.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index a3230a1153..626c246412 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -38,8 +38,8 @@ ## Obtaining OPS -## Build OPS back-end libraries and example applications -### Using cmake +## Build OPS Back-end Libraries and Example Applications +### Using Cmake #### Build the library and example applications together Create a build directory, and run CMake (version 3.18 or newer) @@ -94,15 +94,15 @@ The current tests are mainly based on the applications. -### Using regular Makefiles +### Using Makefiles #### Build back-end library #### Build an application #### Makefile options -## Running example applications +## Running Example Applications ### CloverLeaf ### CloverLeaf_3D_HDF5 ### poisson ### adi -## Runtime flags and options +## Runtime Flags and Options From 6216cb34a78114d42770a6f4efc6f475a5a0be85 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 23:03:24 +0100 Subject: [PATCH 023/324] Update installation.md --- doc/installation.md | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 626c246412..3f77adc265 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -4,9 +4,9 @@ ## Dependencies - * CMake + **CMake** - CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. +CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. 
```bash version=3.19.0 wget https://github.com/Kitware/CMake/releases/download/v$version/cmake-$version-Linux-x86_64.sh @@ -18,28 +18,22 @@ sudo ln -s $cmake_dir/bin/cmake /usr/local/bin/cmake ``` - * Python2 + **Python2** - **Python2** is required by the OPS Python translator. The CMake build system will try to identify it automatically. However, the process can fail sometime (e.g., if there are both Python2 and Python3 installed). If this happens, the path to Python2 can be specified manually by using **-DPython2_EXECUTABLE** when invoking CMake +Python2 is required by the OPS Python translator. The CMake build system will try to identify it automatically. However, the process can fail sometime (e.g., if there are both Python2 and Python3 installed). If this happens, the path to Python2 can be specified manually by using `-DPython2_EXECUTABLE` when invoking CMake - * HDF5 + **HDF5** - [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using -DHDF5_ROOT. +[HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using -DHDF5_ROOT. - * CUDA + **CUDA** - The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path. - - +The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. 
please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path. ## Obtaining OPS ## Build OPS Back-end Libraries and Example Applications -### Using Cmake +### Using cmake #### Build the library and example applications together Create a build directory, and run CMake (version 3.18 or newer) @@ -74,14 +68,16 @@ then the application can be built as cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 make # IEEE=1 this option is important for applications to get accurate results ``` -#### Tests + + The current tests are mainly based on the applications. -#### `cmake` options +#### cmake options * `-DCMAKE_BUILD_TYPE=Release` - enable optimizations * `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only) @@ -94,6 +90,11 @@ The current tests are mainly based on the applications. + + ### Using Makefiles #### Build back-end library #### Build an application From 1ea8b546e5f41ebc287b9c198b05f673846127b9 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 23:17:29 +0100 Subject: [PATCH 024/324] Update installation.md --- doc/installation.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index 3f77adc265..8313a36213 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -31,7 +31,10 @@ Python2 is required by the OPS Python translator. The CMake build system will tr The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path. ## Obtaining OPS - +```bash +git clone https://github.com/gihanmudalige/OPS.git +``` + ## Build OPS Back-end Libraries and Example Applications ### Using cmake #### Build the library and example applications together From 47d0f59f21f1e6defebeb7a1a738f560319d15c9 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 23:18:05 +0100 Subject: [PATCH 025/324] Update installation.md --- doc/installation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 8313a36213..ba3ea88b30 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -24,11 +24,11 @@ Python2 is required by the OPS Python translator. The CMake build system will tr **HDF5** -[HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using -DHDF5_ROOT. +[HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using `-DHDF5_ROOT`. **CUDA** -The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path. +The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. ## Obtaining OPS ```bash From f119c57b386a0cc49e4d20e65971a04a55488700 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 23:24:34 +0100 Subject: [PATCH 026/324] Update installation.md --- doc/installation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index ba3ea88b30..f843facf2d 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -35,9 +35,9 @@ The CMake build system will detect the tookit automatically. If the automatic pr git clone https://github.com/gihanmudalige/OPS.git ``` -## Build OPS Back-end Libraries and Example Applications +## Build OPS ### Using cmake -#### Build the library and example applications together +#### Build library and example applications together Create a build directory, and run CMake (version 3.18 or newer) ```bash @@ -50,7 +50,7 @@ git clone https://github.com/gihanmudalige/OPS.git ``` After installation, the library and the python translator can be found at the direcory specified by CMAKE_INSTALL_PREFIX, together with the executable files for applications at APP_INSTALL_DIR. -#### Build the library and example applications separately +#### Build library and example applications separately In this mode, the library can be firstly built and installed as From afe1945e743b27fa5faebe788002ebdf05794d64 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 23:25:26 +0100 Subject: [PATCH 027/324] Update installation.md --- doc/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index f843facf2d..69b6045554 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -77,9 +77,9 @@ A few tasks for testing codes can be run by ```bash make test ``` +The current tests are mainly based on the applications. --> -The current tests are mainly based on the applications. #### cmake options * `-DCMAKE_BUILD_TYPE=Release` - enable optimizations From 4f543d1accec12f24a3929989fc9ff936d312868 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 23:28:36 +0100 Subject: [PATCH 028/324] Update installation.md --- doc/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index 69b6045554..abaa5995f6 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -48,7 +48,7 @@ git clone https://github.com/gihanmudalige/OPS.git make # IEEE=1 this option is important for applications to get accurate results make install # sudo is needed if a directory like /usr/local/ is chosen. ``` -After installation, the library and the python translator can be found at the direcory specified by CMAKE_INSTALL_PREFIX, together with the executable files for applications at APP_INSTALL_DIR. +After installation, the library and the python translator can be found at the direcory specified by `CMAKE_INSTALL_PREFIX`, together with the executable files for applications at `APP_INSTALL_DIR`. #### Build library and example applications separately From 0f96c0161c803638b07e6706a134f7e7002b9f53 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 23:30:49 +0100 Subject: [PATCH 029/324] Update devanapp.md --- doc/devanapp.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 62ce0a66c0..37d3d1d069 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,4 +1,4 @@ -# Developing and OPS Application +# Developing an OPS Application ## Tutorial ## Supported paralleizations ## Code-generation flags From a00a78c805c04b318697800dc831354274e682ca Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 23:32:20 +0100 Subject: [PATCH 030/324] Update devanapp.md --- doc/devanapp.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 37d3d1d069..3004738caf 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,5 +1,5 @@ # Developing an OPS Application ## Tutorial -## Supported paralleizations -## Code-generation flags +## Supported Paralleizations +## Code-generation Flags ## File I/O From 7875b811ebe1897368dced496babffdde12fea98 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 23:32:54 +0100 Subject: [PATCH 031/324] Update devdoc.md --- doc/devdoc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index de190feabd..5e906c5729 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -3,7 +3,7 @@ ### Frontend API parser ### Target Parallel Templates ### Elemental Kernel Transformations -## Back-end library +## Back-end Library ### Sequential and multi-threaded CPU ### MPI and Partitioning ### HDF5 From cfe49ab7643bddbb732c8b94f83289e12063b60f Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 09:57:52 +0100 Subject: [PATCH 032/324] Create AUTHORS --- AUTHORS | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 AUTHORS diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000000..2d43b90743 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,10 @@ + +List of Authors + +Mike Giles +Gihan Mudalige +Istvan Reguly +Daniel Balogh +Toby Flynn +Satya Jammy +Jianping Meng From f9040d2192c0aa48ea3be28bd5cd82275003af3a Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 10:33:46 +0100 Subject: [PATCH 033/324] Update installation.md --- doc/installation.md | 58 ++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index abaa5995f6..3e92a58771 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -55,28 +55,28 @@ After installation, the library and the python translator can be found at the di In this mode, the library can be firstly built and installed as ```bash - mkdir build - cd build - # Please see below for CMake options - cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL - make # IEEE=1 this option is important for applications to get accurate results - make install # sudo is needed if a system direction is chosen, - ``` +mkdir build +cd build +# Please see below for CMake options +cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL +make # IEEE=1 this option is important for applications to get accurate results +make install # sudo is needed if a system direction is chosen, +``` then the application can be built as ```bash - mkdir appbuild - cd appbuild - # Please see below for CMake options - cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 - make # IEEE=1 this option is important for applications to get accurate results - ``` +mkdir appbuild +cd appbuild +# Please see below for CMake options +cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 +make # IEEE=1 this option is important for applications to get accurate results +``` @@ -99,8 +99,34 @@ The current tests are mainly based on the applications. 
* `HDF5_ROOT` - Installation directory of HDF5 (to support HDF5 based File I/O) if HDF5 not installed in standard location --> ### Using Makefiles +#### Set up environmental variables: + + * `OPS_COMPILER` - compiler to be used (Currently supports Intel, PGI and Cray compilers, but others can be easily incorporated by extending the Makefiles used in step 2 and 3) + * `OPS_INSTALL_PATH` - Installation directory of OPS/ops + * `CUDA_INSTALL_PATH - Installation directory of CUDA, usually `/usr/local/cuda` (to build CUDA libs and applications) + * `OPENCL_INSTALL_PATH` - Installation directory of OpenCL, usually `/usr/local/cuda` for NVIDIA OpenCL implementation (to build OpenCL libs and applications) + * `MPI_INSTALL_PATH` - Installation directory of MPI (to build MPI based distributed memory libs and applications) + * `HDF5_INSTALL_PATH` - Installation directory of HDF5 (to support HDF5 based File I/O) + +See example scripts (e.g. source_intel, source_pgi_15.10, source_cray) under `OPS/ops/` that sets up the environment for building with various compilers (Intel, PGI, Cray). + #### Build back-end library -#### Build an application +For C/C++ back-end use Makefile under `OPS/ops/c` (modify Makefile if required). The libraries will be built in `OPS/ops/c/lib` +```bash +cd $OPS_INSTALL_PATH/c +make +``` +For Fortran back-end use Makefile under `OPS/ops/fortran` (modify Makefile if required). The libraries will be built in `OPS/ops/fortran/lib` +```bash +cd $OPS_INSTALL_PATH/fortran +make +``` +#### Build exampe applications +For example to build CloverLeaf_3D under `OPS/apps/c/CloverLeaf_3D` +```bash +cd ../apps/c/Cloverleaf_3D/ +make +``` #### Makefile options ## Running Example Applications From 85b95fdd1f2cdf5a04af9a1741bb3e12063c03a8 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 10:36:35 +0100 Subject: [PATCH 034/324] Update installation.md --- doc/installation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 3e92a58771..73c09bec39 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -108,7 +108,7 @@ The current tests are mainly based on the applications. * `MPI_INSTALL_PATH` - Installation directory of MPI (to build MPI based distributed memory libs and applications) * `HDF5_INSTALL_PATH` - Installation directory of HDF5 (to support HDF5 based File I/O) -See example scripts (e.g. source_intel, source_pgi_15.10, source_cray) under `OPS/ops/` that sets up the environment for building with various compilers (Intel, PGI, Cray). +See example scripts (e.g. source_intel, source_pgi_15.10, source_cray) under `OPS/ops/scripts` that sets up the environment for building with various compilers (Intel, PGI, Cray). #### Build back-end library For C/C++ back-end use Makefile under `OPS/ops/c` (modify Makefile if required). The libraries will be built in `OPS/ops/c/lib` @@ -127,7 +127,7 @@ For example to build CloverLeaf_3D under `OPS/apps/c/CloverLeaf_3D` cd ../apps/c/Cloverleaf_3D/ make ``` -#### Makefile options + ## Running Example Applications ### CloverLeaf From 09303e9b9b2340b684615cc5a1d9461aa23d611f Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 10:41:18 +0100 Subject: [PATCH 035/324] Update installation.md --- doc/installation.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/installation.md b/doc/installation.md index 73c09bec39..95c787fae4 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -30,6 +30,7 @@ Python2 is required by the OPS Python translator. The CMake build system will tr The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. 
Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. + ## Obtaining OPS ```bash git clone https://github.com/gihanmudalige/OPS.git From 83eccbef5e7bc494fdf800ddf2bd2faf2d2635dd Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 10:45:15 +0100 Subject: [PATCH 036/324] Create apps.md --- doc/apps.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 doc/apps.md diff --git a/doc/apps.md b/doc/apps.md new file mode 100644 index 0000000000..efcea9dbfa --- /dev/null +++ b/doc/apps.md @@ -0,0 +1,4 @@ +# Example Applications +## CloverLeaf (2D, 3D and HDF5) +## poisson +## adi From 1414c2a5a024197fa2d09cd3302db1dfa42d8043 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 10:45:40 +0100 Subject: [PATCH 037/324] Update index.rst --- doc/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/index.rst b/doc/index.rst index 0dcc8c4007..73f406e704 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -14,6 +14,7 @@ Welcome to Test's documentation! installation.md devanapp.md opsapi.md + apps.md devdoc.md pubs.md From 435f7f592cdf97cc1c587f3cf55e41842c3e9e36 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 10:45:48 +0100 Subject: [PATCH 038/324] Update installation.md --- doc/installation.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 95c787fae4..f69ec8c08c 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -130,10 +130,4 @@ make ``` -## Running Example Applications -### CloverLeaf -### CloverLeaf_3D_HDF5 -### poisson -### adi - ## Runtime Flags and Options From 33e1e621a5e25e99ddbc6228191af0155f4156a6 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 10:51:16 +0100 Subject: [PATCH 039/324] Update conf.py --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index c53bf5a553..c92c236874 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -18,7 +18,7 @@ # -- Project information ----------------------------------------------------- project = 'Oxford Parallel library for Structured mesh solvers' -copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others' +copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and [others](https://github.com/OP-DSL/OPS/blob/master/AUTHORS)' author = "Mike Giles, Istvan Reguly, Gihan Mudalige" # The full version, including alpha/beta/rc tags From 74b02ef8f64ddeacb3281d1da96b532c228c3a44 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 10:54:58 +0100 Subject: [PATCH 040/324] Update conf.py --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index c92c236874..10044a49ca 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -18,7 +18,7 @@ # -- Project information ----------------------------------------------------- project = 'Oxford Parallel library for Structured mesh solvers' -copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and [others](https://github.com/OP-DSL/OPS/blob/master/AUTHORS)' +copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and [https://github.com/OP-DSL/OPS/blob/master/AUTHORS]' author = "Mike Giles, Istvan Reguly, Gihan Mudalige" # The full version, including alpha/beta/rc tags From 826a1ccfe3344a5e02402de6ddbc140232fbdfd6 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 10:56:25 +0100 Subject: [PATCH 041/324] Update conf.py --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 10044a49ca..c53bf5a553 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -18,7 +18,7 @@ # -- Project information ----------------------------------------------------- project = 'Oxford Parallel library for Structured mesh solvers' -copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and [https://github.com/OP-DSL/OPS/blob/master/AUTHORS]' +copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others' author = "Mike Giles, Istvan Reguly, Gihan Mudalige" # The full version, including alpha/beta/rc tags From 37febb145bb74c1d8e8ec7102b198d8b3dcf1b9c Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 11:02:09 +0100 Subject: [PATCH 042/324] Delete README --- README | 75 ---------------------------------------------------------- 1 file changed, 75 deletions(-) delete mode 100644 README diff --git a/README b/README deleted file mode 100644 index f7aa601f52..0000000000 --- a/README +++ /dev/null @@ -1,75 +0,0 @@ -OPS is an API with associated libraries and pre-processors to generate -parallel executables for applications on mulit-block structured grids. - -This repository contains the implementation of the run-time library -and the pre-processor, and is structured as follows: - -| -`- ops: Implementation of the user and run-time OPS C/C++ APIs -| -`- apps: Application examples in C and Fortran -| These are examples of user application code and also include -| the target code an OPS pre-processor should produce to correctly -| use the OPS run-time library. -| -`- translator: Python OPS pre-processor for C/C++ API -| -`- doc: Documentation - -Installation -============ - -1. 
Set up environmental variables: - - OPS_COMPILER - compiler to be used (Currently supports Intel, PGI and - Cray compilers, but others can be easily incorporated by extending the - Makefiles used in step 2 and 3) - - OPS_INSTALL_PATH - Installation directory of OPS/ops - - CUDA_INSTALL_PATH - Installation directory of CUDA, - usually /usr/local/cuda (to build CUDA libs and applications) - - OPENCL_INSTALL_PATH - Installation directory of OpenCL, - usually /usr/local/cuda for NVIDIA OpenCL implementation - (to build OpenCL libs and applications) - - MPI_INSTALL_PATH - Installation directory of MPI (to build MPI - based distributed memory libs and applications) - - HDF5_INSTALL_PATH - Installation directory of HDF5 - (to support HDF5 based File I/O) - - See example scripts (e.g. source_intel, source_pgi_15.10, source_cray) - under OPS/ops/ that sets up the environment for building with various - compilers (Intel, PGI, Cray). - -2. Build OPS back-end libraries. - - For C/C++ back-end use Makefile under OPS/ops/c (modify Makefile if required). - The libraries will be built in OPS/ops/c/lib - - cd $OPS_INSTALL_PATH/c - make - - - For Fortran back-end use Makefile under OPS/ops/fortran - (modify Makefile if required). The libraries will be built in OPS/ops/fortran/lib - - cd $OPS_INSTALL_PATH/fortran - make - - -3. Build OPS example applications - - For example to build CloverLeaf_3D under OPS/apps/c/CloverLeaf_3D - - cd ../apps/c/Cloverleaf_3D/ - make - - -How to cite -=========== -Istvan Z Reguly, G.R Mudalige, Mike B Giles. Loop Tiling in Large-Scale -Stencil Codes at Run-time with OPS. (2017) IEEE Transactions on Parallel -and Distributed Systems. (http://dx.doi.org/10.1109/TPDS.2017.2778161) From bdd2fc522affe42c90d3defb9600e942e19852e8 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 11:26:27 +0100 Subject: [PATCH 043/324] Update README.md --- README.md | 137 ++++++++++++------------------------------------------ 1 file changed, 31 insertions(+), 106 deletions(-) diff --git a/README.md b/README.md index a75182274c..68b548ecfd 100644 --- a/README.md +++ b/README.md @@ -1,117 +1,42 @@ -## OPS +# OPS -OPS is an API with associated libraries and pre-processors to generate -parallel executables for applications on multi-block structured grids. +OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. +This repository contains the implementation of the back-end library and the code-generator, and is structured as follows: -This repository contains the implementation of the run-time library -and the pre-processor, and is structured as follows: +* `ops`: Implementation of the user and run-time OPS C/C++ APIs +* `apps`: Application examples in C. + These are examples of user application code and also include the target parallel code generated by the OPS code generator. +* `ops_translator`: Python OPS code generator for C/C++ API +* `scripts` : example scripts for setting environmental variables and testing applications +* `cmake` : cmake installation files +* `makefiles` : makefile based installation files +* `doc`: Documentation -* ops: Implementation of the user and run-time OPS C/C++ APIs +## Documentation -* apps: Application examples in C. - These are examples of user application code and also include - the target code an OPS pre-processor should produce to correctly - use the OPS run-time library. 
- Currently the main application developed with OPS is a single - block structured mesh application - Cloverleaf originally - developed at https://github.com/Warwick-PCAV/CloverLeaf +OPS documentation can be viewed on [Read the Docs](https://ops-dsl.readthedocs.io/). -* translator: Python OPS pre-processor for C/C++ API +## Citing +To cite OPS, please reference the following paper: -* doc: Documentation +[I. Z. Reguly, G. R. Mudalige and M. B. Giles, Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS, in IEEE Transactions on Parallel and Distributed Systems, vol. 29, no. 4, pp. 873-886, 1 April 2018, doi: 10.1109/TPDS.2017.2778161.](https://ieeexplore.ieee.org/abstract/document/8121995) -#### Installation +``` +@ARTICLE{Reguly_et_al_2018, + author={Reguly, István Z. and Mudalige, Gihan R. and Giles, Michael B.}, + journal={IEEE Transactions on Parallel and Distributed Systems}, + title={Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS}, + year={2018}, + volume={29}, + number={4}, + pages={873-886}, + doi={10.1109/TPDS.2017.2778161}} +``` -**Note: The current CMakefile and relevant instructions are mainly tested on linux-based systems including Windows Subsystem for Linux** +## Contact +If you wish to report a bug with the software, please contact the [OP-DSL team](https://op-dsl.github.io/about.html) -##### Dependencies - - * CMake - - CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. - ```bash - version=3.19.0 - wget https://github.com/Kitware/CMake/releases/download/v$version/cmake-$version-Linux-x86_64.sh - # Assume that CMake is going to be installed at /usr/local/cmake - cmake_dir=/usr/local/cmake - # sudo is not necessary for directories in user space. 
- sudo mkdir $cmake_dir - sudo sh ./cmake-$version-Linux-x86_64.sh --prefix=$cmake_dir --skip-license - sudo ln -s $cmake_dir/bin/cmake /usr/local/bin/cmake - ``` - - * Python2 - - **Python2** is required by the OPS Python translator. The CMake build system will try to identify it automatically. However, the process can fail sometime (e.g., if there are both Python2 and Python3 installed). If this happens, the path to Python2 can be specified manually by using **-DPython2_EXECUTABLE** when invoking CMake - - * HDF5 - - [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using -DHDF5_ROOT. - - * CUDA - - The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path. - - - - -##### Build OPS back-end libraries example applications -###### Build the library and example applications together - - Create a build directory, and run CMake (version 3.18 or newer) - ```bash - mkdir build - cd build - # Please see below for CMake options - cmake ${PATH_TO_OPS} -DBUILD_OPS_APPS=ON -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL -DGPU_NUMBER=1 - make # IEEE=1 this option is important for applications to get accurate results - make install # sudo is needed if a directory like /usr/local/ is chosen. - ``` -After installation, the library and the python translator can be found at the direcory specified by CMAKE_INSTALL_PREFIX, together with the executable files for applications at APP_INSTALL_DIR. 
- -###### Build the library and example applications separately - -In this mode, the library can be firstly built and installed as - -```bash - mkdir build - cd build - # Please see below for CMake options - cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL - make # IEEE=1 this option is important for applications to get accurate results - make install # sudo is needed if a system direction is chosen, - ``` -then the application can be built as - -```bash - mkdir appbuild - cd appbuild - # Please see below for CMake options - cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 - make # IEEE=1 this option is important for applications to get accurate results - ``` -###### Tests - -A few tasks for testing codes can be run by -```bash - make test - ``` -The current tests are mainly based on the applications. -###### Options of interest to specify to `cmake` include: - - * `-DCMAKE_BUILD_TYPE=Release` - enable optimizations - * `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only) - * `-DOPS_TEST=ON` - enable the tests - * `-DCMAKE_INSTALL_PREFIX=` - specify the installation direction for the library (/usr/local by default, Library CMake only) - * `-DAPP_INSTALL_DIR=` - specify the installation direction for the applications ($HOME/OPS-APPS by default) - * `-DGPU_NUMBER=` - specify the number of GPUs used in the tests - * `-DOPS_INSTALL_DIR=` - specify where the OPS library is installed (Application CMake only, see [here](#build-the-library-and-example-applications-separately)) - * `-DOPS_VERBOSE_WARNING=ON` - show verbose output during building process - - +## Licence +OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From 1b95b2089c01b2baeda2952c77054f9a721b7f00 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 11:41:46 +0100 Subject: [PATCH 044/324] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 68b548ecfd..6d63ce13cc 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # OPS +[![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/build.svg)](https://gitlab.com/op-dsl-ci/ops-ci) +[![Documentation Status](https://ops-dsl.readthedocs.io/badge/?version=latest)](https://ops-dsl.readthedocs.io/en/latest/?badge=latest) + OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. This repository contains the implementation of the back-end library and the code-generator, and is structured as follows: From 59b1236e6c4f7c11becf957bc9f2e46d54f9e367 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 12:01:40 +0100 Subject: [PATCH 045/324] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6d63ce13cc..356f3338bf 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # OPS -[![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/build.svg)](https://gitlab.com/op-dsl-ci/ops-ci) -[![Documentation Status](https://ops-dsl.readthedocs.io/badge/?version=latest)](https://ops-dsl.readthedocs.io/en/latest/?badge=latest) +[![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/pipeline.svg)](https://gitlab.com/op-dsl-ci/ops-ci) +[![Documentation Status](https://readthedocs.org/projects/ops-dsl/badge/?version=latest)](https://ops-dsl.readthedocs.io/en/latest/?badge=latest) + OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. From 530fd78a59ecb275b7cb1e876e9780637619a8d8 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:03:22 +0100 Subject: [PATCH 046/324] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 356f3338bf..2077d3fd19 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,10 @@ # OPS +OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. 
+ [![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/pipeline.svg)](https://gitlab.com/op-dsl-ci/ops-ci) [![Documentation Status](https://readthedocs.org/projects/ops-dsl/badge/?version=latest)](https://ops-dsl.readthedocs.io/en/latest/?badge=latest) - -OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. - This repository contains the implementation of the back-end library and the code-generator, and is structured as follows: * `ops`: Implementation of the user and run-time OPS C/C++ APIs From fee9702d33a804c5801ef2de895f0ada52314fa0 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:06:05 +0100 Subject: [PATCH 047/324] Update index.rst --- doc/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/index.rst b/doc/index.rst index 73f406e704..f991bca140 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -3,7 +3,7 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to Test's documentation! +Welcome to OPS documentation! ================================ .. toctree:: From ac767713f32a76d207def2f96097ddea32dde2c3 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:18:31 +0100 Subject: [PATCH 048/324] Update installation.md --- doc/installation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index f69ec8c08c..349277b097 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -86,8 +86,8 @@ The current tests are mainly based on the applications. 
* `-DCMAKE_BUILD_TYPE=Release` - enable optimizations * `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only) * `-DOPS_TEST=ON` - enable the tests - * `-DCMAKE_INSTALL_PREFIX=` - specify the installation direction for the library (/usr/local by default, Library CMake only) - * `-DAPP_INSTALL_DIR=` - specify the installation direction for the applications ($HOME/OPS-APPS by default) + * `-DCMAKE_INSTALL_PREFIX=` - specify the installation direction for the library (`/usr/local` by default, Library CMake only) + * `-DAPP_INSTALL_DIR=` - specify the installation direction for the applications (`$HOME/OPS-APPS` by default) * `-DGPU_NUMBER=` - specify the number of GPUs used in the tests * `-DOPS_INSTALL_DIR=` - specify where the OPS library is installed (Application CMake only, see [here](#build-the-library-and-example-applications-separately)) * `-DOPS_VERBOSE_WARNING=ON` - show verbose output during building process @@ -104,7 +104,7 @@ The current tests are mainly based on the applications. * `OPS_COMPILER` - compiler to be used (Currently supports Intel, PGI and Cray compilers, but others can be easily incorporated by extending the Makefiles used in step 2 and 3) * `OPS_INSTALL_PATH` - Installation directory of OPS/ops - * `CUDA_INSTALL_PATH - Installation directory of CUDA, usually `/usr/local/cuda` (to build CUDA libs and applications) + * `CUDA_INSTALL_PATH` - Installation directory of CUDA, usually `/usr/local/cuda` (to build CUDA libs and applications) * `OPENCL_INSTALL_PATH` - Installation directory of OpenCL, usually `/usr/local/cuda` for NVIDIA OpenCL implementation (to build OpenCL libs and applications) * `MPI_INSTALL_PATH` - Installation directory of MPI (to build MPI based distributed memory libs and applications) * `HDF5_INSTALL_PATH` - Installation directory of HDF5 (to support HDF5 based File I/O) From 069826fc09d5fa0fc0d700ecdc33cb17ba2c260e Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 12:26:41 +0100 Subject: [PATCH 049/324] Update installation.md --- doc/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index 349277b097..c3ba25f83c 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -7,7 +7,7 @@ **CMake** CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. - ```bash + ```bash {r} version=3.19.0 wget https://github.com/Kitware/CMake/releases/download/v$version/cmake-$version-Linux-x86_64.sh # Assume that CMake is going to be installed at /usr/local/cmake From 28b6ff06d6ed18ffb89039672d6d8a27503716f0 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:36:37 +0100 Subject: [PATCH 050/324] Update introduction.md --- doc/introduction.md | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/doc/introduction.md b/doc/introduction.md index d7d6a499c3..4ac754d449 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -1,13 +1,29 @@ # Introduction -OPS is a high-level framework with associated libraries and -preprocessors to generate parallel executables for applications on -**multi-block structured grids**. Multi-block structured grids consists -of an unstructured collection of structured meshes/grids. This document -describes the OPS C++ API, which supports the development of -single-block and multi-block structured meshes. - ## Overview + +OPS is a high-level framework with associated libraries and preprocessors to generate parallel executables for applications on **multi-block structured grids**. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. 
This document describes the OPS C++ API, which supports the development of single-block and multi-block structured meshes. + ## Licencing +OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. + ## Citing +To cite OPS, please reference the following paper: + +[I. Z. Reguly, G. R. Mudalige and M. B. Giles, Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS, in IEEE Transactions on Parallel and Distributed Systems, vol. 29, no. 4, pp. 873-886, 1 April 2018, doi: 10.1109/TPDS.2017.2778161.](https://ieeexplore.ieee.org/abstract/document/8121995) + +``` +@ARTICLE{Reguly_et_al_2018, + author={Reguly, István Z. and Mudalige, Gihan R. and Giles, Michael B.}, + journal={IEEE Transactions on Parallel and Distributed Systems}, + title={Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS}, + year={2018}, + volume={29}, + number={4}, + pages={873-886}, + doi={10.1109/TPDS.2017.2778161}} +``` +Full list of publications from the OPS project can be found in the [Publications](https://opensbli.readthedocs.io/en/latest/citing.html) section. + ## Support +The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html). From ce9f7515080e18eb83c7440356fdae49cc9c541f Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 12:37:36 +0100 Subject: [PATCH 051/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index 4ac754d449..c35c0cac9d 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -23,7 +23,7 @@ To cite OPS, please reference the following paper: pages={873-886}, doi={10.1109/TPDS.2017.2778161}} ``` -Full list of publications from the OPS project can be found in the [Publications](https://opensbli.readthedocs.io/en/latest/citing.html) section. +Full list of publications from the OPS project can be found in the [Publications](https://ops-dsl.readthedocs.io/en/markdowndocdev/pubs.html) section. ## Support The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html). From 0eca7b834d642f4cc5d04192c1ab3bec3ca830ec Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:40:39 +0100 Subject: [PATCH 052/324] Update apps.md --- doc/apps.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/apps.md b/doc/apps.md index efcea9dbfa..b023ec04ee 100644 --- a/doc/apps.md +++ b/doc/apps.md @@ -1,4 +1,4 @@ -# Example Applications +# Examples ## CloverLeaf (2D, 3D and HDF5) ## poisson ## adi From f7675a41f5c2c22fac43ac3dfb5def81de99cd09 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:47:20 +0100 Subject: [PATCH 053/324] Update installation.md --- doc/installation.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/installation.md b/doc/installation.md index c3ba25f83c..babf9e69e1 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -27,9 +27,16 @@ Python2 is required by the OPS Python translator. 
The CMake build system will tr [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using `-DHDF5_ROOT`. **CUDA** +The CUDA backend targets NVIDIA GPUs with a compute capability of 3.0 or greater. The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. +**HIP** +The HIP backend targets AMD GPUs which are supported by the ROCm stack + +**SYCL** + +**Tridiagonal Solver** ## Obtaining OPS ```bash From 5ece5ef435bfba232dd6d3ea97e2630e6c4c35c4 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:47:33 +0100 Subject: [PATCH 054/324] Update installation.md --- doc/installation.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/installation.md b/doc/installation.md index babf9e69e1..bb90d33c65 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -32,6 +32,7 @@ The CUDA backend targets NVIDIA GPUs with a compute capability of 3.0 or greater The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. **HIP** + The HIP backend targets AMD GPUs which are supported by the ROCm stack **SYCL** From a5bd7a539fec1043ca5e1e2a7601a94298333c94 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 12:51:21 +0100 Subject: [PATCH 055/324] Update installation.md --- doc/installation.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index bb90d33c65..eac7ec076e 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -27,9 +27,8 @@ Python2 is required by the OPS Python translator. The CMake build system will tr [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using `-DHDF5_ROOT`. **CUDA** -The CUDA backend targets NVIDIA GPUs with a compute capability of 3.0 or greater. - -The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. + +The [CUDA](https://developer.nvidia.com/cuda-downloads) backend targets NVIDIA GPUs with a compute capability of 3.0 or greater. The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. **HIP** From 90ae6d0b36e82749170338c4227f5ba1fde3197a Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 12:58:33 +0100 Subject: [PATCH 056/324] Update installation.md --- doc/installation.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index eac7ec076e..f91be6b8c6 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -39,8 +39,9 @@ The HIP backend targets AMD GPUs which are supported by the ROCm stack **Tridiagonal Solver** ## Obtaining OPS +The latest OPS source code can be obtained by cloning the [OPS repository](https://github.com/OP-DSL/OPS) using ```bash -git clone https://github.com/gihanmudalige/OPS.git +git clone https://github.com/OP-DSL/OPS.git ``` ## Build OPS From ccb079ca5404aa59389a7e18d74061a0c5bdb5d6 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 15:20:21 +0100 Subject: [PATCH 057/324] Update installation.md --- doc/installation.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/installation.md b/doc/installation.md index f91be6b8c6..4967718e60 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -38,6 +38,12 @@ The HIP backend targets AMD GPUs which are supported by the ROCm stack **Tridiagonal Solver** +To use the tridiagonal solver OPS API in applications and build example applications such as `adi`, `adi_burger` and `adi_burger_3D` the open source tridiagonal solver (scalar) library needs to be cloned and built from the [Tridsolver repository](https://github.com/OP-DSL/tridsolver). +```bash +git clone https://github.com/OP-DSL/tridsolver.git +``` +Details on building scalar tridiagonal solver library can be found in the [README](https://github.com/OP-DSL/tridsolver/blob/master/scalar/README) file located at the appropriate subdirectory. + ## Obtaining OPS The latest OPS source code can be obtained by cloning the [OPS repository](https://github.com/OP-DSL/OPS) using ```bash From 981a8825a475d1fc295d999978d6b3b9f69b4a29 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 15:23:53 +0100 Subject: [PATCH 058/324] Update installation.md --- doc/installation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 4967718e60..ec6acbc0c8 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -60,7 +60,7 @@ git clone https://github.com/OP-DSL/OPS.git cd build # Please see below for CMake options cmake ${PATH_TO_OPS} -DBUILD_OPS_APPS=ON -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL -DGPU_NUMBER=1 - make # IEEE=1 this option is important for applications to get accurate results + make # IEEE=1 enable IEEE flags in compiler make install # sudo is needed if a directory like /usr/local/ is chosen. ``` After installation, the library and the python translator can be found at the direcory specified by `CMAKE_INSTALL_PREFIX`, together with the executable files for applications at `APP_INSTALL_DIR`. @@ -74,7 +74,7 @@ mkdir build cd build # Please see below for CMake options cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL -make # IEEE=1 this option is important for applications to get accurate results +make # IEEE=1 enable IEEE flags in compiler make install # sudo is needed if a system direction is chosen, ``` then the application can be built as From 9055199d2d5e02d21f05a566ce5258e740923dec Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 15:51:16 +0100 Subject: [PATCH 059/324] Update introduction.md --- doc/introduction.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/introduction.md b/doc/introduction.md index c35c0cac9d..73497527aa 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -27,3 +27,8 @@ Full list of publications from the OPS project can be found in the [Publications ## Support The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. 
Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html). + +## Funding +The development of OPS was in part supported by the UK Engineering and Physical Sciences Research Council (EPSRC) grants [EP/K038567/1](http://gow.epsrc.ac.uk/NGBOViewGrant.aspx?GrantRef=EP/K038567/1) (“Future-proof massively-parallel execution of multi-block applications”), [EP/J010553/1](http://gow.epsrc.ac.uk/NGBOViewGrant.aspx?GrantRef=EP/J010553/1) (“Software for Emerging Architectures - ASEArch"), The UK Turbulence Consortium grant [EP/T026170/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/T026170/1), The Janos Bolyai Research Scholarship of the Hungarian Academy of Sciences, the Royal Society through their Industry Fellowship Scheme (INF/R1/180012), and the Thematic Research Cooperation Establishing Innovative Informatic and Info-communication Solutions Project, which has been supported by the European Union and co-financed by the European Social Fund under grant number EFOP-3.6.2-16-2017-00013. Research funding support was also provided by the UK AWE under grants CDK0660 ("The Production of Predictive Models for Future Computing Requirements"), CDK0724 ("AWE Technical Outreach Programme"), AWE grant for "High-level Abstractions for Performance, Portability and Continuity of Scientific Software on Future Computing Systems" and the Numerical Algorithms Group [NAG](https://www.nag.com/). + +Hardware resources for development and testing provided by the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. 
DE-AC05-00OR22725, the [ARCHER](http://www.archer.ac.uk) and ARCHER2(https://www.archer2.ac.uk/) UK National Supercomputing Service, [University of Oxford Advanced Research Computing (ARC) facility](http://dx.doi.org/10.5281/zenodo.22558) and through hardware donations and access provided by NVIDIA and Intel. From fee20a588f3f7676a305bb1f7f62a51e127d3e9f Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 15:59:29 +0100 Subject: [PATCH 060/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index 73497527aa..e50ff7d19c 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -29,6 +29,6 @@ Full list of publications from the OPS project can be found in the [Publications The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html). ## Funding -The development of OPS was in part supported by the UK Engineering and Physical Sciences Research Council (EPSRC) grants [EP/K038567/1](http://gow.epsrc.ac.uk/NGBOViewGrant.aspx?GrantRef=EP/K038567/1) (“Future-proof massively-parallel execution of multi-block applications”), [EP/J010553/1](http://gow.epsrc.ac.uk/NGBOViewGrant.aspx?GrantRef=EP/J010553/1) (“Software for Emerging Architectures - ASEArch"), The UK Turbulence Consortium grant [EP/T026170/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/T026170/1), The Janos Bolyai Research Scholarship of the Hungarian Academy of Sciences, the Royal Society through their Industry Fellowship Scheme (INF/R1/180012), and the Thematic Research Cooperation Establishing Innovative Informatic and Info-communication Solutions Project, which has been supported by the European Union and co-financed by the European Social Fund under grant number EFOP-3.6.2-16-2017-00013. 
Research funding support was also provided by the UK AWE under grants CDK0660 ("The Production of Predictive Models for Future Computing Requirements"), CDK0724 ("AWE Technical Outreach Programme"), AWE grant for "High-level Abstractions for Performance, Portability and Continuity of Scientific Software on Future Computing Systems" and the Numerical Algorithms Group [NAG](https://www.nag.com/). +The development of OPS was in part supported by the UK Engineering and Physical Sciences Research Council (EPSRC) grants [EP/K038494/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/K038494/1) (“Future-proof massively-parallel execution of multi-block applications”), [EP/J010553/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/J010553/1) (“Software for Emerging Architectures - ASEArch"), The UK Turbulence Consortium grant [EP/T026170/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/T026170/1), The Janos Bolyai Research Scholarship of the Hungarian Academy of Sciences, the Royal Society through their Industry Fellowship Scheme (INF/R1/180012), and the Thematic Research Cooperation Establishing Innovative Informatic and Info-communication Solutions Project, which has been supported by the European Union and co-financed by the European Social Fund under grant number EFOP-3.6.2-16-2017-00013. Research funding support was also provided by the UK AWE under grants CDK0660 ("The Production of Predictive Models for Future Computing Requirements"), CDK0724 ("AWE Technical Outreach Programme"), AWE grant for "High-level Abstractions for Performance, Portability and Continuity of Scientific Software on Future Computing Systems" and the Numerical Algorithms Group [NAG](https://www.nag.com/). Hardware resources for development and testing provided by the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. 
DE-AC05-00OR22725, the [ARCHER](http://www.archer.ac.uk) and ARCHER2(https://www.archer2.ac.uk/) UK National Supercomputing Service, [University of Oxford Advanced Research Computing (ARC) facility](http://dx.doi.org/10.5281/zenodo.22558) and through hardware donations and access provided by NVIDIA and Intel. From dcdec60469b34f7d79943aac0c1c32ea4d44561e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:12:23 +0100 Subject: [PATCH 061/324] Update setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0b0fbbc774..94c56f8368 100644 --- a/setup.py +++ b/setup.py @@ -5,8 +5,8 @@ setup(name='ops', version='dev', description='OPS is an API with associated libraries and preprocessors to generate parallel executables for applications on mulit-block structured meshes.', - author='Mike Giles, Istvan Reguly, Gihan Mudalige, and others', - url='http://www.oerc.ox.ac.uk/projects/ops', + author='Gihan Mudalige, Istvan Reguly, Mike Giles, and others', + url='https://op-dsl.github.io/', packages=['ops_translator', 'ops_translator.c', 'ops_translator.fortran'], scripts=[], classifiers=[ From 92a36729bea05185a7559e532e78eb9c41169e81 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:13:12 +0100 Subject: [PATCH 062/324] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2077d3fd19..781adcf87e 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,6 @@ To cite OPS, please reference the following paper: ## Contact If you wish to report a bug with the software, please contact the [OP-DSL team](https://op-dsl.github.io/about.html) -## Licence +## License OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. 
From 5d9cfa5a46be31a431011e825d5712fb7d230e5e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:17:35 +0100 Subject: [PATCH 063/324] Update apps.md --- doc/apps.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/apps.md b/doc/apps.md index b023ec04ee..4361e0ffde 100644 --- a/doc/apps.md +++ b/doc/apps.md @@ -1,4 +1,5 @@ # Examples -## CloverLeaf (2D, 3D and HDF5) +## CloverLeaf 2D, +## CloverLeaf 3D with HDF5 ## poisson ## adi From 4d951ac6da10976485aa1119b525ca16856597de Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:17:44 +0100 Subject: [PATCH 064/324] Update apps.md --- doc/apps.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/apps.md b/doc/apps.md index 4361e0ffde..5bee74c5de 100644 --- a/doc/apps.md +++ b/doc/apps.md @@ -1,5 +1,5 @@ # Examples -## CloverLeaf 2D, +## CloverLeaf 2D ## CloverLeaf 3D with HDF5 ## poisson ## adi From bba65bb92734b74a3e7870e24803666ec1c59852 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:27:03 +0100 Subject: [PATCH 065/324] Create perf.md --- doc/perf.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/perf.md diff --git a/doc/perf.md b/doc/perf.md new file mode 100644 index 0000000000..3da5366b8a --- /dev/null +++ b/doc/perf.md @@ -0,0 +1 @@ +Performance Tuning From 652caadada55a9e5936608141d29f1b3bf931bac Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:27:22 +0100 Subject: [PATCH 066/324] Update index.rst --- doc/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/index.rst b/doc/index.rst index f991bca140..39efe81875 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -15,6 +15,7 @@ Welcome to OPS documentation! devanapp.md opsapi.md apps.md + perf.md devdoc.md pubs.md From 2891b3280cda869857538e9d01a70a99337b83da Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 16:33:14 +0100 Subject: [PATCH 067/324] Update perf.md --- doc/perf.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index 3da5366b8a..8b0dccc449 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -1 +1,9 @@ -Performance Tuning +# Performance Tuning + +## Compiler flags for vectorization +## Cache-blocking Tiling +## OpenMP with MPI +## CUDA arguments +## CUDA-aware MPI +## OpenCL arguments + From 2e764ba9d5939e3aa384327554eef13631cedb33 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:49:11 +0100 Subject: [PATCH 068/324] Update perf.md --- doc/perf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index 8b0dccc449..40a2e43fe9 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -1,6 +1,6 @@ # Performance Tuning -## Compiler flags for vectorization +## Vectorization ## Cache-blocking Tiling ## OpenMP with MPI ## CUDA arguments From a6312722c970ebecbfa093f7757ef8c7f8096832 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 17:01:06 +0100 Subject: [PATCH 069/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index e50ff7d19c..3ccdb1db21 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -2,7 +2,7 @@ ## Overview -OPS is a high-level framework with associated libraries and preprocessors to generate parallel executables for applications on **multi-block structured grids**. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. This document describes the OPS C++ API, which supports the development of single-block and multi-block structured meshes. 
+[OPS](https://github.com/OP-DSL/OPS)(Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. ## Licencing OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From d5c7d6253b09ff1a6a3132a619132f3c45887518 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 17:03:44 +0100 Subject: [PATCH 070/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index 3ccdb1db21..7487a62c9c 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -2,7 +2,7 @@ ## Overview -[OPS](https://github.com/OP-DSL/OPS)(Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. 
+[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. ## Licencing OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From d11cd2a8ff0f400c2e8ff36c1ec55a3e9a35647e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 11:41:14 +0100 Subject: [PATCH 071/324] Update README.md --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 781adcf87e..059e57a2f1 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,17 @@ To cite OPS, please reference the following paper: ## Contact If you wish to report a bug with the software, please contact the [OP-DSL team](https://op-dsl.github.io/about.html) +## Contributing + +To contribute to OPS please use the following steps : + +1. Clone this repository (on your local system) +2. Create a new branch in your cloned repository +3. Make changes / contributions in your new branch +4. Submit your changes by creating a Pull Request to the `develop` branch of the OPS repository + +The contributions in the `develop` branch will be merged into the master branch as we create a new release. + ## License OPS is released as an open-source project under the BSD 3-Clause License. 
See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From c50b89949574c401aea9b8640634cd74ba4f6e3c Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 11:43:30 +0100 Subject: [PATCH 072/324] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 059e57a2f1..9c0555f554 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,8 @@ To cite OPS, please reference the following paper: doi={10.1109/TPDS.2017.2778161}} ``` -## Contact -If you wish to report a bug with the software, please contact the [OP-DSL team](https://op-dsl.github.io/about.html) +## Support and Contact +The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. Users can also email the authors directly by contacting the the [OP-DSL team](https://op-dsl.github.io/about.html). ## Contributing From 408c21457aeae19e51c9aa071db5674e96f9f087 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 12:15:43 +0100 Subject: [PATCH 073/324] Update opsapi.md --- doc/opsapi.md | 54 ++++++++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 408bf9e341..13712c3517 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -1,44 +1,28 @@ # OPS API +## Overview + +Many of the API and library follows the structure of the OP2 high-level library for unstructured mesh +applications~\cite{op2}. + +The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The key idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. 
The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. + +To clarify some of the important issues in the API, we note here some needs connected with a 3D application: +* When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. +* To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. +* To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. +* In multigrid, we are working with two grids with one having twice as many points as the other in each direction. To handle this we require a stencil with a non-unit stride. +* In multi-block grids, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. an $i,j$ face of one block may correspond to the $j,k$ face of another block. + ## Key concepts and structure -An OPS application can generally be divided into two key parts: -initialisation and parallel execution. During the initialisation phase, -one or more blocks (ops_block) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a -block, and have a specific size (in each dimension of the block), which -may be slightly different across different datasets (e.g. 
staggered -grids), in some directions they may be degenerate (a size of 1), or they -can represent data associated with different multigrid levels (where -their size if a multiple or a fraction of other datasets). Datasets can -be declared with empty (NULL) pointers, then OPS will allocate the -appropriate amount of memory, may be passed non-NULL pointers (currently -only supported in non-MPI environments), in which case OPS will assume -the memory is large enough for the data and the block halo, and there -are HDF5 dataset declaration routines which allow the distributed -reading of datasets from HDF5 files. The concept of blocks is necessary -to group datasets together, as in a multi-block problem, in a -distributed memory environment, OPS needs to be able to determine how to +An OPS application can generally be divided into two key parts: (1) initialisation and (2) parallel execution. During the initialisation phase, one or more blocks ( we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. 
The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to decompose the problem. -The initialisation phase usually also consists of defining the stencils -to be used later on (though they can be defined later as well), which -describe the data access patterns used in parallel loops. Stencils are -always relative to the "current" point; e.g. if at iteration $(i,j)$, we -wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two -points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in -one of the dimensions the dataset's size is 1), as well as for -multigrid, there are special strided, restriction, and prolongation -stencils: they differ from normal stencils in that as one steps through -a grid in a parallel loop, the stepping is done with a non-unit stride -for these datasets. For example, in a 2D problem, if we have a -degenerate dataset called xcoords, size $(N,1)$, then we will need a -stencil with stride $(1,0)$ to access it in a regular 2D loop. - -Finally, the initialisation phase may declare a number of global -constants - these are variables in global scope that can be accessed -from within user kernels, without having to pass them in explicitly. -These may be scalars or small arrays, generally for values that do not -change during execution, though they may be updated during execution +The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration $(i,j)$, we wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two points: $\{(-1, 0), (0, 0)\}$. 
To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride +for these datasets. For example, in a 2D problem, if we have a degenerate dataset called xcoords, size $(N,1)$, then we will need a stencil with stride $(1,0)$ to access it in a regular 2D loop. + +Finally, the initialisation phase may declare a number of global constants - these are variables in global scope that can be accessed from within elemental kernels, without having to pass them in explicitly. These may be scalars or small arrays, generally for values that do not change during execution, though they may be updated during execution with repeated calls to `ops_decl_const`. The initialisation phase is terminated by a call to `ops_partition`. From 33057c6e813f12e8f14f0bdb39744be566f9d51e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 12:43:10 +0100 Subject: [PATCH 074/324] Update opsapi.md --- doc/opsapi.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 13712c3517..08cb669984 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -2,9 +2,6 @@ ## Overview -Many of the API and library follows the structure of the OP2 high-level library for unstructured mesh -applications~\cite{op2}. - The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The key idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. 
The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. To clarify some of the important issues in the API, we note here some needs connected with a 3D application: From 93f8dc02e8ba7cd693c4e3db7b4fc42ca72a50b7 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 12:44:59 +0100 Subject: [PATCH 075/324] Update opsapi.md --- doc/opsapi.md | 41 +++++++---------------------------------- 1 file changed, 7 insertions(+), 34 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 08cb669984..ca1d41b542 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -4,13 +4,6 @@ The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The key idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. -To clarify some of the important issues in the API, we note here some needs connected with a 3D application: -* When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. -* To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. -* To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. 
if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. -* In multigrid, we are working with two grids with one having twice as many points as the other in each direction. To handle this we require a stencil with a non-unit stride. -* In multi-block grids, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. an $i,j$ face of one block may correspond to the $j,k$ face of another block. - ## Key concepts and structure An OPS application can generally be divided into two key parts: (1) initialisation and (2) parallel execution. During the initialisation phase, one or more blocks ( we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to @@ -98,34 +91,14 @@ blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. 
an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. -To clarify some of the important issues in designing the API, we note -here some needs connected with a 3D application: - -- When looping over the interior with loop indices $i,j,k$, often - there are 1D arrays which are referenced using just one of the - indices. - -- To implement boundary conditions, we often loop over a 2D face, - accessing both the 3D dataset and data from a 2D dataset. - -- To implement periodic boundary conditions using dummy "halo" points, - we sometimes have to copy one plane of boundary data to another. - e.g. if the first dimension has size $I$ then we might copy the - plane $i=I\!-\!2$ to plane $i=0$, and plane $i=1$ to plane - $i=I\!-\!1$. - -- In multigrid, we are working with two grids with one having twice as - many points as the other in each direction. To handle this we - require a stencil with a non-unit stride. - -- In multi-block grids, we have several structured blocks. The - connectivity between the faces of different blocks can be quite - complex, and in particular they may not be oriented in the same way, - i.e. an $i,j$ face of one block may correspond to the $j,k$ face of - another block. This is awkward and hard to handle simply. +To clarify some of the important issues in the API, we note here some needs connected with a 3D application: +* When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. +* To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. +* To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. 
+* In multigrid, we are working with two grids with one having twice as many points as the other in each direction. To handle this we require a stencil with a non-unit stride. +* In multi-block grids, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. an $i,j$ face of one block may correspond to the $j,k$ face of another block. -The latest proposal is to handle all of these different requirements -through stencil definitions. +OPS handle all of these different requirements through stencil definitions. ## OPS C++ API From d276b893d2bf5001c5d6fd7a125bd922c7aae5cb Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 12:58:26 +0100 Subject: [PATCH 076/324] Update opsapi.md --- doc/opsapi.md | 84 ++++++++------------------------------------------- 1 file changed, 13 insertions(+), 71 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index ca1d41b542..4f13c141e7 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -17,81 +17,23 @@ with repeated calls to `ops_decl_const`. The initialisation phase is terminated by a call to `ops_partition`. -The bulk of the application consists of parallel loops, implemented -using calls to `ops_par_loop`. These constructs work with datasets, -passed through the opaque `ops_dat` handles declared during the -initialisation phase. The iterations of parallel loops are semantically -independent, and it is the responsibility of the user to enforce this: -the order in which iterations are executed cannot affect the result -(within the limits of floating point precision). Parallel loops are -defined on a block, with a prescribed iteration range that is always -defined from the perspective of the dataset written/modified (the sizes -of datasets, particularly in multigrid situations, may be very -different). 
Datasets are passed in using `ops_arg_dat`, and during -execution, values at the current grid point will be passed to the user -kernel. These values are passed wrapped in a templated `ACC<>` object -(templated on the type of the data), whose parentheses operator is -overloaded, which the user must use to specify the relative offset to -access the grid point's neighbours (which accesses have to match the the -declared stencil). Datasets written may only be accessed with a -one-point, zero-offset stencil (otherwise the parallel semantics may be -violated). - -Other than datasets, one can pass in read-only scalars or small arrays -that are iteration space invariant with `ops_arg_gbl` (typically -weights, $\delta t$, etc. which may be different in different loops). -The current iteration index can also be passed in with `ops_arg_idx`, -which will pass a globally consistent index to the user kernel (i.e. +The bulk of the application consists of parallel loops, implemented using calls to `ops_par_loop`. These constructs work with datasets, passed through the opaque `ops_dat` handles declared during the initialisation phase. The iterations of parallel loops are semantically independent, and it is the responsibility of the user to enforce this: +the order in which iterations are executed cannot affect the result (within the limits of floating point precision). Parallel loops are defined on a block, with a prescribed iteration range that is always defined from the perspective of the dataset written/modified (the sizes of datasets, particularly in multigrid situations, may be very +different). Datasets are passed in using `ops_arg_dat`, and during execution, values at the current grid point will be passed to the user kernel. 
These values are passed wrapped in a templated `ACC<>` object (templated on the type of the data), whose parentheses operator is overloaded, which the user must use to specify the relative offset to +access the grid point's neighbours (which accesses have to match the the declared stencil). Datasets written may only be accessed with a one-point, zero-offset stencil (otherwise the parallel semantics may be violated). + +Other than datasets, one can pass in read-only scalars or small arrays that are iteration space invariant with `ops_arg_gbl` (typically weights, $\delta t$, etc. which may be different in different loops). The current iteration index can also be passed in with `ops_arg_idx`, which will pass a globally consistent index to the user kernel (i.e. also under MPI). -Reductions in loops are done using the ops_arg_reduce argument, which -takes a reduction handle as an argument. The result of the reduction can -then be acquired using a separate call to `ops_reduction_result`. The -semantics are the following: a reduction handle after it was declared is -in an "uninitialised" state. The first time it is used as an argument to -a loop, its type is determined (increment/min/max), and is initialised -appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in -parallel loops are combined together, up until the point, where the -result is acquired using `ops_reduction_result`, which then sets it back -to an uninitialised state. This also implies, that different parallel -loops, which all use the same reduction handle, but are otherwise -independent, are independent and their partial reduction results can be -combined together associatively and commutatively. - -OPS takes responsibility for all data, its movement and the execution of -parallel loops. 
With different execution hardware and optimisations, -this means OPS will re-organise data as well as execution (potentially -across different loops), and therefore any data accesses or manipulation -may only be done through the OPS API. - -This restriction is exploited by a lazy execution mechanism in OPS. The -idea is that OPS API calls that do not return a result can be not -executed immediately, rather queued, and once an API call requires -returning some data, operations in the queue are executed, and the -result is returned. This allows OPS to analyse and optimise operations -in the queue together. This mechanism is fully automated by OPS, and is -used with the various \_tiled executables. For more information on how -to use this mechanism for improving CPU performance, see Section -[\[sec:tiling\]](#sec:tiling){reference-type="ref" -reference="sec:tiling"}. Some API calls triggering the execution of -queued operations include ops_reduction_result, and the functions in the -data access API. +Reductions in loops are done using the `ops_arg_reduce` argument, which takes a reduction handle as an argument. The result of the reduction can then be acquired using a separate call to `ops_reduction_result`. The semantics are the following: a reduction handle after it was declared is in an "uninitialised" state. The first time it is used as an argument to a loop, its type is determined (increment/min/max), and is initialised appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in parallel loops are combined together, up until the point, where the result is acquired using `ops_reduction_result`, which then sets it back to an uninitialised state. This also implies, that different parallel loops, which all use the same reduction handle, but are otherwise independent, are independent and their partial reduction results can be combined together associatively and commutatively. 
+OPS takes responsibility for all data, its movement and the execution of parallel loops. With different execution hardware and optimisations, this means OPS will **re-organise** data as well as execution (potentially across different loops), and therefore **any data accesses or manipulation must only be done through the OPS API**. + +This restriction is exploited by a lazy execution mechanism in OPS. The idea is that OPS API calls that do not return a result need not be executed immediately, rather queued, and once an API call requires returning some data, operations in the queue are executed, and the result is returned. This allows OPS to analyse and optimise operations +in the queue together. This mechanism is fully automated by OPS, and is used with the various `_tiled` executables. For more information on how to use this mechanism for improving CPU performance, see Section on Tiling. Some API calls triggering the execution of queued operations include `ops_reduction_result`, and the functions in the +data access API. -Many of the API and library follows the structure of the OP2 high-level -library for unstructured mesh applications [@op2]. However the -structured mesh domain is distinct from the unstructured mesh -applications domain due to the implicit connectivity between -neighbouring mesh elements (such as vertices, cells) in structured -meshes/grids. The key idea is that operations involve looping over a -"rectangular" multi-dimensional set of grid points using one or more -"stencils" to access data. In multi-block grids, we have several -structured blocks. The connectivity between the faces of different -blocks can be quite complex, and in particular they may not be oriented -in the same way, i.e. an $i,j$ face of one block may correspond to the -$j,k$ face of another block. This is awkward and hard to handle simply. 
- -To clarify some of the important issues in the API, we note here some needs connected with a 3D application: +To clarify some of the important issues encountered when designing the OPS API, we note here some needs connected with a 3D application: * When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. * To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. * To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. From d403507aa5acde3b67c68c462553b91f84f8b65d Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 12:59:48 +0100 Subject: [PATCH 077/324] Update opsapi.md --- doc/opsapi.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 4f13c141e7..f5fb04fe29 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -4,7 +4,7 @@ The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The key idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. -## Key concepts and structure +## Key Concepts and Structure An OPS application can generally be divided into two key parts: (1) initialisation and (2) parallel execution. 
During the initialisation phase, one or more blocks ( we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to decompose the problem. @@ -33,7 +33,7 @@ This restriction is exploited by a lazy execution mechanism in OPS. The idea is in the queue together. This mechanism is fully automated by OPS, and is used with the various `_tiled` executables. For more information on how to use this mechanism for improving CPU performance, see Section on Tiling. Some API calls triggering the execution of queued operations include `ops_reduction_result`, and the functions in the data access API. 
-To clarify some of the important issues encountered when designing the OPS API, we note here some needs connected with a 3D application: +To further clarify some of the important issues encountered when designing the OPS API, we note here some needs connected with a 3D application: * When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. * To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. * To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. From 5df825858605464cc5e296aa4e50fbc9662bb69a Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 13:15:02 +0100 Subject: [PATCH 078/324] Update opsapi.md --- doc/opsapi.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index f5fb04fe29..a2771d3dbf 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -43,4 +43,10 @@ To further clarify some of the important issues encountered when designing the O OPS handle all of these different requirements through stencil definitions. ## OPS C++ API - +### Initialisation declaration and termination routines +### Diagnostic and output routines +### Halo exchange +### Parallel loop syntax +### Stencils +### Checkpointing +### Access to OPS data From 377396144f72802c2a9fa642a7feffb206d92647 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Mon, 27 Sep 2021 13:17:37 +0100 Subject: [PATCH 079/324] Update opsapi.md --- doc/opsapi.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index a2771d3dbf..abeed444cf 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -43,7 +43,8 @@ To further clarify some of the important issues encountered when designing the O OPS handle all of these different requirements through stencil definitions. ## OPS C++ API -### Initialisation declaration and termination routines +### Initialisation and termination routines +### Declaration routines ### Diagnostic and output routines ### Halo exchange ### Parallel loop syntax From 286d6dbde766907761cd571eaf05ab49d4705adf Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 14:31:54 +0100 Subject: [PATCH 080/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index 7487a62c9c..ac9e9b2da0 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -2,7 +2,7 @@ ## Overview -[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. 
+[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. ## Licencing OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From 71ebcc2eefb633057f8f28caabda1de3d60fab61 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 14:33:10 +0100 Subject: [PATCH 081/324] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9c0555f554..bec553a5ac 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # OPS -OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. +OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. 
The OPS API is embedded in C/C++ and Fortran. + [![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/pipeline.svg)](https://gitlab.com/op-dsl-ci/ops-ci) [![Documentation Status](https://readthedocs.org/projects/ops-dsl/badge/?version=latest)](https://ops-dsl.readthedocs.io/en/latest/?badge=latest) From 83fbaac6bc06e70c8e2df93046a8caab0946b658 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 14:34:01 +0100 Subject: [PATCH 082/324] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bec553a5ac..89ddd03dd3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # OPS -OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. +OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes. The OPS API is embedded in C/C++ and Fortran. [![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/pipeline.svg)](https://gitlab.com/op-dsl-ci/ops-ci) From 8438143b896268fd1d3df8ed2a0931f983151d87 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Mon, 27 Sep 2021 14:37:43 +0100 Subject: [PATCH 083/324] Update opsapi.md --- doc/opsapi.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index abeed444cf..9bf6144ebb 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -2,7 +2,7 @@ ## Overview -The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The key idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. +The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The main idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. ## Key Concepts and Structure From e29ed12384c7d97ec609eef337807d69ed00cfbd Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Mon, 27 Sep 2021 14:42:44 +0100 Subject: [PATCH 084/324] Update opsapi.md --- doc/opsapi.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 9bf6144ebb..956e5c7323 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -6,7 +6,7 @@ The key characteristic of structured mesh applications is the implicit connectiv ## Key Concepts and Structure -An OPS application can generally be divided into two key parts: (1) initialisation and (2) parallel execution. During the initialisation phase, one or more blocks ( we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to +The OPS API allows to declare a computation over such multi-block structured meshes. An OPS application can generally be declared in two key parts: (1) initialisation and (2) iteration over the mesh (carried out as a parallel loop). 
During the initialisation phase, one or more blocks (we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to decompose the problem. The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration $(i,j)$, we wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride From f6c3b648f3f414819fe0d0e81392b9a917cf6dc9 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Mon, 27 Sep 2021 14:47:21 +0100 Subject: [PATCH 085/324] Update opsapi.md --- doc/opsapi.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 956e5c7323..3e884e3589 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -2,7 +2,7 @@ ## Overview -The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The main idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. +The key characteristic of structured mesh applications is the implicit connectivity between neighboring mesh elements (such as vertices, cells). The main idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. 
## Key Concepts and Structure From 568d189acee342ef4087d6605eaa83a5490388eb Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Wed, 29 Sep 2021 12:42:19 +0100 Subject: [PATCH 086/324] Fix a few violation to good markdown rules --- doc/opsapi.md | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 3e884e3589..5b15f651f1 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -27,27 +27,36 @@ also under MPI). Reductions in loops are done using the `ops_arg_reduce` argument, which takes a reduction handle as an argument. The result of the reduction can then be acquired using a separate call to `ops_reduction_result`. The semantics are the following: a reduction handle after it was declared is in an "uninitialised" state. The first time it is used as an argument to a loop, its type is determined (increment/min/max), and is initialised appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in parallel loops are combined together, up until the point, where the result is acquired using `ops_reduction_result`, which then sets it back to an uninitialised state. This also implies, that different parallel loops, which all use the same reduction handle, but are otherwise independent, are independent and their partial reduction results can be combined together associatively and commutatively. -OPS takes responsibility for all data, its movement and the execution of parallel loops. With different execution hardware and optimisations, this means OPS will **re-organise** data as well as execution (potentially across different loops), and therefore **any data accesses or manipulation must only be done through the OPS API**. +OPS takes responsibility for all data, its movement and the execution of parallel loops. 
With different execution hardware and optimisations, this means OPS will **re-organise** data as well as execution (potentially across different loops), and therefore **any data accesses or manipulation must only be done through the OPS API**. This restriction is exploited by a lazy execution mechanism in OPS. The idea is that OPS API calls that do not return a result need not be executed immediately, rather queued, and once an API call requires returning some data, operations in the queue are executed, and the result is returned. This allows OPS to analyse and optimise operations in the queue together. This mechanism is fully automated by OPS, and is used with the various `_tiled` executables. For more information on how to use this mechanism for improving CPU performance, see Section on Tiling. Some API calls triggering the execution of queued operations include `ops_reduction_result`, and the functions in the data access API. To further clarify some of the important issues encountered when designing the OPS API, we note here some needs connected with a 3D application: -* When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. -* To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. -* To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. -* In multigrid, we are working with two grids with one having twice as many points as the other in each direction. To handle this we require a stencil with a non-unit stride. -* In multi-block grids, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. 
an $i,j$ face of one block may correspond to the $j,k$ face of another block. + +* When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. +* To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. +* To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. +* In multigrid, we are working with two grids with one having twice as many points as the other in each direction. To handle this we require a stencil with a non-unit stride. +* In multi-block grids, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. an $i,j$ face of one block may correspond to the $j,k$ face of another block. OPS handle all of these different requirements through stencil definitions. -## OPS C++ API +## C/C++ API + ### Initialisation and termination routines + ### Declaration routines + ### Diagnostic and output routines + ### Halo exchange + ### Parallel loop syntax + ### Stencils + ### Checkpointing + ### Access to OPS data From b7b4038cbf81b39bbf0fe2c24fb80e7045c3fafc Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 1 Oct 2021 13:30:50 +0100 Subject: [PATCH 087/324] Update opsapi.md --- doc/opsapi.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 3e884e3589..d27f7a065e 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -9,8 +9,8 @@ The key characteristic of structured mesh applications is the implicit connectiv The OPS API allows to declare a computation over such multi-block structured meshes. 
An OPS application can generally be declared in two key parts: (1) initialisation and (2) iteration over the mesh (carried out as a parallel loop). During the initialisation phase, one or more blocks (we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to decompose the problem. -The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration $(i,j)$, we wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two points: $\{(-1, 0), (0, 0)\}$. 
To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride -for these datasets. For example, in a 2D problem, if we have a degenerate dataset called xcoords, size $(N,1)$, then we will need a stencil with stride $(1,0)$ to access it in a regular 2D loop. +The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration *(i,j)*, we wish to access *(i-1,j)* and *(i,j)*, then the stencil will have two points: *{(-1, 0), (0, 0)}*. To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride +for these datasets. For example, in a 2D problem, if we have a degenerate dataset called xcoords, size *(N,1)*, then we will need a stencil with stride *(1,0)* to access it in a regular 2D loop. Finally, the initialisation phase may declare a number of global constants - these are variables in global scope that can be accessed from within elemental kernels, without having to pass them in explicitly. These may be scalars or small arrays, generally for values that do not change during execution, though they may be updated during execution with repeated calls to `ops_decl_const`. From 1fba5f90b13ff1b2615372dd089fac9dc28be755 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 1 Oct 2021 13:44:47 +0100 Subject: [PATCH 088/324] Update opsapi.md --- doc/opsapi.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index d27f7a065e..b046951bac 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -9,8 +9,8 @@ The key characteristic of structured mesh applications is the implicit connectiv The OPS API allows to declare a computation over such multi-block structured meshes. An OPS application can generally be declared in two key parts: (1) initialisation and (2) iteration over the mesh (carried out as a parallel loop). During the initialisation phase, one or more blocks (we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to decompose the problem. -The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. 
Stencils are always relative to the "current" point; e.g. if at iteration *(i,j)*, we wish to access *(i-1,j)* and *(i,j)*, then the stencil will have two points: *{(-1, 0), (0, 0)}*. To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride -for these datasets. For example, in a 2D problem, if we have a degenerate dataset called xcoords, size *(N,1)*, then we will need a stencil with stride *(1,0)* to access it in a regular 2D loop. +The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration $(i,j)$, we wish to access $(i-1,j)$ and $(i,j)$, then the stencil will have two points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride +for these datasets. For example, in a 2D problem, if we have a degenerate dataset called xcoords, size $(N,1)$, then we will need a stencil with stride $(1,0)$ to access it in a regular 2D loop. Finally, the initialisation phase may declare a number of global constants - these are variables in global scope that can be accessed from within elemental kernels, without having to pass them in explicitly. These may be scalars or small arrays, generally for values that do not change during execution, though they may be updated during execution with repeated calls to `ops_decl_const`. 
From ebbcd3042dd6a74572250437b8ccb5155cca32ba Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sat, 9 Oct 2021 21:36:58 +0100 Subject: [PATCH 089/324] Port latex doc to markdown --- doc/perf.md | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index 40a2e43fe9..be069ba51b 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -1,9 +1,56 @@ # Performance Tuning ## Vectorization + +## Executing with GPUDirect + +GPU direct support for MPI+CUDA, to enable (on the OPS side) add +**-gpudirect** when running the executable. You may also have to use +certain environmental flags when using different MPI distributions. For +an example of the required flags and environmental settings on the +Cambridge Wilkes2 GPU cluster see:\ + ## Cache-blocking Tiling +OPS has a code generation (ops_gen_mpi_lazy) and build target for +tiling. Once compiled, to enable, use the `OPS_TILING` runtime parameter. This will look at the L3 cache size of your CPU and guess the correct +tile size. If you want to alter the amount of cache to be used for the +guess, use the ``OPS_CACHE_SIZE=XX`` runtime parameter, where the value is +in Megabytes. To manually specify the tile sizes, use the +``OPS_TILESIZE_X``, ``OPS_TILESIZE_Y``, and ``OPS_TILESIZE_Z`` runtime arguments. + +When MPI is combined with OpenMP tiling can be extended to the MPI +halos. 
Set `OPS_TILING_MAXDEPTH` to increase the halo depths so that +halos for multiple `ops_par_loops` can be exchanged with a single MPI +message (see [@TPDS2017] for more details). +To test, compile CloverLeaf under ``apps/c/CloverLeaf``, modify clover.in +to use a $6144^2$ mesh, then run as follows: +For OpenMP with tiling: +```bash +export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING +``` +For MPI+OpenMP with tiling: +```bash +export OMP_NUM_THREADS=xx; mpirun -np xx ./cloverleaf_mpi_tiled OPS_TILING OPS_TILING_MAXDEPTH=6 +``` +To manually specify the tile sizes (in number of grid points), use the +OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments: +```bash +export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING OPS_TILESIZE_X=600 OPS_TILESIZE_Y=200 +``` ## OpenMP with MPI ## CUDA arguments +The CUDA (and OpenCL) thread block sizes can be controlled by setting +the ``OPS_BLOCK_SIZE_X``, ``OPS_BLOCK_SIZE_Y`` and ``OPS_BLOCK_SIZE_Z`` runtime +arguments. For example, +```bash +./cloverleaf_cuda OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 +``` ## CUDA-aware MPI -## OpenCL arguments +## OpenCL arguments + +`OPS_CL_DEVICE=XX` runtime flag sets the OpenCL device to execute the +code on. + +Usually `OPS_CL_DEVICE=0` selects the CPU and `OPS_CL_DEVICE=1` selects +GPUs. From a82ce4b40cc238677020eee1738cf3662fdde514 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sat, 9 Oct 2021 21:37:46 +0100 Subject: [PATCH 090/324] port latex doc (API) to markdown --- doc/opsapi.md | 618 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 618 insertions(+) diff --git a/doc/opsapi.md b/doc/opsapi.md index 5b15f651f1..f9c91351fb 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -47,16 +47,634 @@ OPS handle all of these different requirements through stencil definitions.
### Initialisation and termination routines +#### ops_init + +__void ops_init(int argc, char** argv, int diags_level)__ + +This routine must be called before all other OPS routines + +| Arguments | Description | +| ----------- | ----------- | +| argc, argv | the usual command line arguments | +| diags_level | an integer which defines the level of debugging diagnostics and reporting to be performed | + +Currently, higher diags_levels does the following checks + +`diags_level` $=$ 1 : no diagnostics, default to achieve best runtime +performance. + +`diags_level` $>$ 1 : print block decomposition and `ops_par_loop` +timing breakdown. + +`diags_level` $>$ 4 : print intra-block halo buffer allocation feedback +(for OPS internal development only) + +`diags_level` $>$ 5 : check if intra-block halo MPI sends depth match +MPI receives depth (for OPS internal development only) + +#### ops_exit + +__void ops_exit()__ + +This routine must be called last to cleanly terminate the OPS computation. + ### Declaration routines +#### ops_decl_block + +__ops_block ops_decl_block(int dims, char *name)__ + +This routine defines a structured grid block. +| Arguments | Description | +| ----------- | ----------- | +| dims | dimension of the block | +| name | a name used for output diagnostics | + +#### ops_decl_block_hdf5 + +__ops_block ops_decl_block_hdf5(int dims, char *name, char *file)__ + +This routine reads the details of a structured grid block from a named HDF5 file + +| Arguments | Description | +| ----------- | ----------- | +| dims | dimension of the block | +| name | a name used for output diagnostics | +| file |hdf5 file to read and obtain the block information from| + +Although this routine does not read in any extra information about the +block from the named HDF5 file than what is already specified in the +arguments, it is included here for error checking (e.g. 
check if blocks +defined in an HDF5 file match the declared arguments in an +application) and completeness. + +#### ops_decl_dat + +__ops_dat ops_decl_dat(ops_block block, int dim, int *size, int *base, int *d_m, int *d_p, T *data, char *type, char *name)__ + +This routine defines a dataset. + +| Arguments | Description | +| ----------- | ----------- | +block | structured block | +dim | dimension of dataset (number of items per grid element) | +size | size in each dimension of the block | +base | base indices in each dimension of the block | +d_m | padding from the face in the negative direction for each dimension (used for block halo) | +d_p | padding from the face in the positive direction for each dimension (used for block halo) | +data | input data of type *T* | +type | the name of type used for output diagnostics (e.g. ``double``, ``float``)| +name | a name used for output diagnostics| + +The `size` allows declaring different sized data arrays on a given +`block`. `d_m` and `d_p` are depths of the "block halos" that are used to +indicate the offset from the edge of a block (in both the negative and +positive directions of each dimension). + +#### ops_decl_dat_hdf5 + +__ops_dat ops_decl_dat_hdf5(ops_block block, int dim, char *type, char *name, char *file)__ + +This routine defines a dataset to be read in from a named HDF5 file + +| Arguments | Description | +| ----------- | ----------- | +|block | structured block| +|dim | dimension of dataset (number of items per grid element)| +|type | the name of type used for output diagnostics (e.g. ``double``, ``float``)| +|name | name of the dat used for output diagnostics| +|file | hdf5 file to read and obtain the data from| + +#### ops_decl_const + +__void ops_decl_const(char const * name, int dim, char const * type, T * data )__ + +This routine defines a global constant: a variable in global scope.
Global constants need to be declared upfront + so that they can be correctly handled for different parallelizations. For e.g CUDA on GPUs. Once defined + they remain unchanged throughout the program, unless changed by a call to ops_update_const(..). The ``name'' and``type'' + parameters **must** be string literals since they are used in the code generation step + +| Arguments | Description | +| ----------- | ----------- | +|name | a name used to identify the constant | +|dim | dimension of dataset (number of items per element) | +|type | the name of type used for output diagnostics (e.g. ``double'',``float'') | +|data | pointer to input data of type *T* | + +#### ops_decl_halo + +__ops_halo ops_decl_halo(ops_dat from, ops_dat to, int *iter_size, int* from_base, int *to_base, int *from_dir, int *to_dir)__ + +| Arguments | Description | +| ----------- | ----------- | +|from | origin dataset | +|to| destination dataset | +|item_size | defines an iteration size (number of indices to iterate over in each direction) | +|from_base | indices of starting point in \"from\" dataset| +|to_base | indices of starting point in \"to\" dataset | +|from_dir | direction of incrementing for \"from\" for each dimension of `iter_size` | +|to_dir | direction of incrementing for \"to\" for each dimension of `iter_size`| + +A from_dir \[1,2\] and a to_dir \[2,1\] means that x in the first block +goes to y in the second block, and y in first block goes to x in second +block. A negative sign indicates that the axis is flipped. (Simple +example: a transfer from (1:2,0:99,0:99) to (-1:0,0:99,0:99) would use +iter_size = \[2,100,100\], from_base = \[1,0,0\], to_base = \[-1,0,0\], +from_dir = \[0,1,2\], to_dir = \[0,1,2\]. In more complex case this +allows for transfers between blocks with different orientations.) 
+ +#### ops_decl_halo_hdf5 + +__ops_halo ops_decl_halo_hdf5(ops_dat from, ops_dat to, char* file)__ + +This routine reads in a halo relationship between two datasets defined on two different blocks from a named HDF5 file + +| Arguments | Description | +| ----------- | ----------- | +|from| origin dataset| +|to| destination dataset| +|file| hdf5 file to read and obtain the data from| + +#### ops_decl_halo_group + +__ops_halo_group ops_decl_halo_group(int nhalos, ops_halo *halos)__ + +This routine defines a collection of halos. Semantically, when an exchange is triggered for all halos in a group, there is no order defined in which they are carried out. +| Arguments | Description | +| ----------- | ----------- | +|nhalos| number of halos in *halos* | +|halos| array of halos| + +#### ops_decl_reduction_handle} + +__ops_reduction ops_decl_reduction_handle(int size, char *type, char *name)__ +This routine defines a reduction handle to be used in a parallel loop + +| Arguments | Description | +| ----------- | ----------- | +|size| size of data in bytes | +|type| the name of type used for output diagnostics (e.g. ``double'',``float'') | +|name| name of the dat used for output diagnostics| + +__{void ops_reduction_result(ops_reduction handle, T *result) +{This routine returns the reduced value held by a reduction handle. When OPS uses lazy execution, this will trigger the execution of all previously queued OPS operations.} + +|handle| the *ops_reduction* handle | +|result| a pointer to write the results to, memory size has to match the declared | + +#### ops_partition + +__ops_partition(char *method)__ + +Triggers a multi-block partitioning across a distributed memory set of processes. (links to a dummy function for single node parallelizations). 
This routine should only be called after all the ops_halo ops_decl_block +and ops_halo ops_decl_dat statements have been declared + +| Arguments | Description | +| ----------- | ----------- | +|method| string describing the partitioning method. Currently this string is not used internally, but is simply a place-holder to indicate different partitioning methods in the future. | + ### Diagnostic and output routines +#### ops_diagnostic_output + +__void ops_diagnostic_output()__ + +This routine prints out various useful bits of diagnostic info about sets, mappings and datasets. Usually used right +after an ops_partition() call to print out the details of the decomposition + +#### ops_printf + +__void ops_printf(const char * format, ...)__ + +This routine simply prints a variable number of arguments; it is created is in place of the standard C +printf function which would print the same on each MPI process + +#### ops_timers + +__void ops_timers(double *cpu, double *et)__ + gettimeofday() based timer to start/end timing blocks of code + +| Arguments | Description | +| ----------- | ----------- | +|cpu| variable to hold the CPU time at the time of invocation| +|et| variable to hold the elapsed time at the time of invocation| + +#### ops_fetch_block_hdf5_file + +__void ops_fetch_block_hdf5_file(ops_block block, char *file)__ + +Write the details of an ops_block to a named HDF5 file. Can be used over MPI (puts the data in an ops_dat into an +HDF5 file using MPI I/O) + +| Arguments | Description | +| ----------- | ----------- | +|block| ops_block to be written| +|file| hdf5 file to write to| + +#### ops_fetch_stencil_hdf5_file + +__void ops_fetch_stencil_hdf5_file(ops_stencil stencil, char *file)__ + +Write the details of an ops_block to a named HDF5 file. 
Can be used over MPI (puts the data in an ops_dat into an HDF5 file using MPI I/O) + +| Arguments | Description | +| ----------- | ----------- | +|stencil| ops_stencil to be written +|file| hdf5 file to write to + +#### ops_fetch_dat_hdf5_file + +__void ops_fetch_dat_hdf5_file(ops_dat dat, const char *file)__ + +Write the details of an ops_block to a named HDF5 file. Can be used over MPI (puts the data in an ops_dat into an +HDF5 file using MPI I/O) + +| Arguments | Description | +| ----------- | ----------- | +|dat| ops_dat to be written| +|file| hdf5 file to write to| + +#### ops_print_dat_to_txtfile + +__void ops_print_dat_to_txtfile(ops_dat dat, chat *file)__ +Write the details of an ops_block to a named text file. When used under an MPI parallelization each MPI process +will write its own data set separately to the text file. As such it does not use MPI I/O. The data can be viewed using +a simple text editor + +| Arguments | Description | +| ----------- | ----------- | +|dat| ops_dat to to be written| +|file| text file to write to| + +#### ops_timing_output} + +__void ops_timing_output(FILE *os)__ + +Print OPS performance performance details to output stream + +| Arguments | Description | +| ----------- | ----------- | +|os| output stream, use stdout to print to standard out| + +#### ops_NaNcheck} + +__void ops_NaNcheck(ops_dat dat)__ + +Check if any of the values held in the \texttt{dat} is a NaN. If a NaN +is found, prints an error message and exits. + +| Arguments | Description | +| ----------- | ----------- | +|dat| ops_dat to to be checked| + ### Halo exchange +#### ops_halo_transfer + +__void ops_halo_transfer(ops_halo_group group)__ + +This routine exchanges all halos in a halo group and will block execution of subsequent computations that depend on +the exchanged data. 
+ +| Arguments | Description | +| ----------- | ----------- | +|group| the halo group| + ### Parallel loop syntax +A parallel loop with N arguments has the following syntax: + +#### ops_par_loop + +__void ops_par_loop(\ void (*kernel)(...),char *name, ops_block block, int dims, int *range, ops_arg arg1,ops_arg arg2, ..., ops_arg argN )__ + +| Arguments | Description | +| ----------- | ----------- | +|kernel| user's kernel function with N arguments| +|name| name of kernel function, used for output diagnostics| +|block| the ops_block over which this loop executes| +|dims| dimension of loop iteration| +|range| iteration range array| +|args| arguments| + +The {\bf ops_arg} arguments in {\bf ops_par_loop} are provided by one of the +following routines, one for global constants and reductions, and the other +for OPS datasets. + +#### ops_arg_gbl + +__ops_arg ops_arg_gbl(T *data, int dim, char *type, ops_access acc)__ + +Passes a scalar or small array that is invariant of the iteration space (not to be confused with ops_decl_const, which facilitates global scope variables). + +| Arguments | Description | +| ----------- | ----------- | +|data| data array| +|dim| array dimension| +|type| string representing the type of data held in data| +|acc| access type| + +#### ops_arg_reduce + +__ops_arg ops_arg_reduce(ops_reduction handle, int dim, char *type, ops_access acc)__ + +Passes a pointer to a variable that needs to be incremented (or swapped for min/max reduction) by the user kernel. + +| Arguments | Description | +| ----------- | ----------- | +|handle| an *ops_reduction* handle| +|dim| array dimension (according to *type*)| +|type| string representing the type of data held in data| +|acc| access type| + +#### ops_arg_dat + +__ops_arg ops_arg_dat(ops_dat dat, ops_stencil stencil, char *type,ops_access acc)__ + +Passes a pointer wrapped in ac ACC<> object to the value(s) at the current grid point to the user kernel. 
The ACC object's parentheses operator has to be used for dereferencing the pointer. + +| Arguments | Description | +| ----------- | ----------- | +|dat| dataset| +|stencil| stencil for accessing data| +|type| string representing the type of data held in dataset| +|acc| access type| + +#### ops_arg_idx + +__ops_arg ops_arg_idx()__ + +Give you an array of integers (in the user kernel) that have the index of +the current grid point, i.e. idx[0] is the index in x, idx[1] is the index in y, etc. This is a globally consistent +index, so even if the block is distributed across different MPI partitions, it gives you the same indexes. Generally +used to generate initial geometry. + ### Stencils +The final ingredient is the stencil specification, for which we have two versions: simple and strided. + +#### ops_decl_stencil + +__ops_stencil ops_decl_stencil(int dims,int points, int *stencil, char *name)__ + +| Arguments | Description | +| ----------- | ----------- | +|dims| dimension of loop iteration| +|points| number of points in the stencil| +|stencil| stencil for accessing data| +|name| string representing the name of the stencil| + +#### ops_decl_strided_stencil + +__ops_stencil ops_decl_strided_stencil(int dims, int points, int *stencil, int *stride, char *name)__ + +| Arguments | Description | +| ----------- | ----------- | +|dims| dimension of loop iteration| +|points| number of points in the stencil| +|stencil| stencil for accessing data| +|stride| stride for accessing data| +|name| string representing the name of the stencil| + +#### ops_decl_stencil_hdf5 + +__ops_stencil ops_decl_stencil_hdf5(int dims,int points, char *name, char* file)__ + +| Arguments | Description | +| ----------- | ----------- | +|dims| dimension of loop iteration| +|points| number of points in the stencil| +|name| string representing the name of the stencil| +|file| hdf5 file to write to| + + In the strided case, the semantics for the index of data to be +accessed, for stencil point*p*, in 
dimension *m* are defined as + +```c++ + stride[m]*loop_index[m] + stencil[p*dims+m] +``` + + where ``loop_index[m]`` is the iteration index (within the +user-defined iteration space) in the different dimensions. + +If, for one or more dimensions, both ``stride[m]`` and +``stencil[p*dims+m]`` are zero, then one of the following must be true; + +* the dataset being referenced has size 1 for these dimensions + +* these dimensions are to be omitted and so the dataset has +dimension equal to the number of remaining dimensions. + +See *OPS/apps/c/CloverLeaf/build_field.cpp* and *OPS/apps/c/CloverLeaf/generate.cpp* for an example *ops_decl_strided_stencil* declaration and its use in a loop,respectively. + +These two stencil definitions probably take care of all of the +cases in the Introduction except for multiblock applications with interfaces +with different orientations -- this will need a third, even more general, +stencil specification. The strided stencil will handle both multigrid +(with a stride of 2 for example) and the boundary condition and reduced +dimension applications (with a stride of 0 for the relevant dimensions). + ### Checkpointing +OPS supports the automatic checkpointing of applications. Using the API below, the user specifies the file name for the +checkpoint and an average time interval between checkpoints, OPS will then automatically save all necessary information +periodically that is required to fast-forward to the last checkpoint if a crash occurred. Currently, when re-launching +after a crash, the same number of MPI processes have to be used. To enable checkpointing mode, the *OPS_CHECKPOINT* runtime argument has to be used. + +#### ops_checkpointing_init + +__bool ops_checkpointing_init(const char *filename, double interval, int options)__ + +Initialises the checkpointing system, has to be called after {\tt ops_partition}. Returns true if the application launches in restore +mode, false otherwise. 
+ +| Arguments | Description | +| ----------- | ----------- | +|filename| name of the file for checkpointing. In MPI, this will automatically be post-fixed with the rank ID.| +|interval| average time (seconds) between checkpoints| +|options| a combinations of flags, listed in *ops_checkpointing.h*, also see below| + +* OPS_CHECKPOINT_INITPHASE - indicates that there are a number of parallel loops at the very beginning of the simulations which should be excluded from any checkpoint; mainly because they initialise datasets that do not change during the main body of the execution. During restore mode these loops are executed as usual. An example would be the computation of the mesh geometry, which can be excluded from the checkpoint if it is re-computed when recovering and restoring a checkpoint. The API call *void ops_checkpointing_initphase_done()* indicates the end of this initial phase. + +* OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually controls the location of the checkpoint, and explicitly specifies the list of \texttt{ops_dat}s to be saved. + +* OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the location of the checkpoint, and it also enables fast-forwarding, by skipping the execution of the +application (even though none of the parallel loops would actually execute, there may be significant work outside of those) up to the checkpoint + +* OPS_CHECKPOINT_MANUAL - Indicates that when the corresponding API function is called, the checkpoint should be created. Assumes the presence of the above two options as well. + +#### ops_checkpointing_manual_datlist + +__void ops_checkpointing_manual_datlist(int ndats, ops_dat *datlist)__ + +A user can call this routine at a point in the code to mark the location of a checkpoint. At this point, the list of datasets specified +will be saved. 
The validity of what is saved is not checked by the checkpointing algorithm assuming that the user knows +what data sets to be saved for full recovery. This routine should be called frequently (compared to check-pointing +frequency) and it will trigger the creation of the checkpoint the first time it is called after the timeout occurs. + +| Arguments | Description | +| ----------- | ----------- | +|ndats| number of datasets to be saved| +|datlist| arrays of *ops_dat* handles to be saved| + +#### ops_checkpointing_fastfw + +__bool ops_checkpointing_fastfw(int nbytes, char *payload)__ + +A use can call this routine at a point in the code to mark the location of a checkpoint. At this point, the +specified payload (e.g. iteration count, simulation time, etc.) along with the necessary datasets, as determined by the +checkpointing algorithm will be saved. This routine should be called frequently (compared to checkpointing frequency), +will trigger the creation of the checkpoint the first time it is called after the timeout occurs. In restore mode, +will restore all datasets the first time it is called, and returns true indicating that the saved payload is returned +in payload. Does not save reduction data. + +| Arguments | Description | +| ----------- | ----------- | +|nbytes| size of the payload in bytes| +|payload| pointer to memory into which the payload is packed| + +#### ops_checkpointing_manual_datlist_fastfw + +__bool ops_checkpointing_manual_datlist_fastfw(int ndats, op_dat *datlist, int nbytes, char *payload)__ + +Combines the manual datlist and fastfw calls. 
+ +| Arguments | Description | +| ----------- | ----------- | +|ndats| number of datasets to be saved| +|datlist| arrays of *ops_dat* handles to be saved| +|nbytes| size of the payload in bytes| +|payload| pointer to memory into which the payload is packed| + +#### ops_checkpointing_manual_datlist_fastfw_trigger + +__bool ops_checkpointing_manual_datlist_fastfw_trigger(int ndats, opa_dat *datlist, int +nbytes, char *payload)__ + +With this routine it is possible to manually trigger checkpointing, instead of relying on the timeout process. as such +it combines the manual datlist and fastfw calls, and triggers the creation of a checkpoint when called. + +| Arguments | Description | +| ----------- | ----------- | +|ndats| number of datasets to be saved| +|datlist| arrays of *ops_dat* handles to be saved| +|nbytes| size of the payload in bytes| +|payload| pointer to memory into which the payload is packed| + +\noindent The suggested use of these \textbf{manual} functions is of course when the optimal location for checkpointing +is known - one of the ways to determine that is to use the built-in algorithm. More details of this will be reported +in a tech-report on checkpointing, to be published later. + ### Access to OPS data + +his section describes APIS that give the user access to internal data structures in OPS and return data to user-space. These should be used cautiously and sparsely, as they can affect performance significantly + +#### ops_dat_get_local_npartitions + +__int ops_dat_get_local_npartitions(ops_dat dat)__ + +This routine returns the number of chunks of the given dataset held by the current process. 
+ +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| + +#### ops_dat_get_global_npartitions} + +__int ops_dat_get_global_npartitions(ops_dat dat)} +{This routine returns the number of chunks of the given dataset held by all processes.} +|dat| the dataset + +#### ops_dat_get_extents + +__void ops_dat_get_extents(ops_dat dat, int part, int *disp, int *sizes)__ + +This routine returns the MPI displacement and size of a given chunk of the given dataset on the current process. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|disp| an array populated with the displacement of the chunk within the ``global'' distributed array| +|sizes| an array populated with the spatial extents| + +#### ops_dat_get_raw_metadata + +__char* ops_dat_get_raw_metadata(ops_dat dat, int part, int *disp, int *size, int *stride, int *d_m, int *d_p)__ + +This routine returns array shape metadata corresponding to the ops_dat. Any of the arguments that are not of interest, may be NULL. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|disp| an array populated with the displacement of the chunk within the ``global'' distributed array| +|size| an array populated with the spatial extents +|stride| an array populated strides in spatial dimensions needed for column-major indexing| +|d_m| an array populated with padding on the left in each dimension. Note that these are negative values| +|d_p| an array populated with padding on the right in each dimension| + +#### ops_dat_get_raw_pointer + +__char* ops_dat_get_raw_pointer(ops_dat dat, int part, ops_stencil stencil, ops_memspace *memspace)__ + +This routine returns a pointer to the internally stored data, with MPI halo regions automatically updated as required by the supplied stencil. The strides required to index into the dataset are also given. 
+ +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|stencil| a stencil used to determine required MPI halo exchange depths| +|memspace| when set to OPS_HOST or OPS_DEVICE, returns a pointer to data in that memory space, otherwise must be set to 0, and returns whether data is in the host or on the device| + +#### ops_dat_release_raw_data + +__void ops_dat_release_raw_data(ops_dat dat, int part, ops_access acc)__ + +Indicates to OPS that a dataset previously accessed with ops_dat_get_raw_pointer is released by the user, and also tells OPS how it was accessed. + +A single call to ops_dat_release_raw_data() releases all pointers obtained by previous calls to ops_dat_get_raw_pointer() calls on the same dat and with the same *memspace argument, i.e. calls do not nest. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset +|part| the chunk index (has to be 0)| +|acc| the kind of access that was used by the user (OPS_READ if it was read only, OPS_WRITE if it was overwritten, OPS_RW if it was read and written)| + +#### ops_dat_release_raw_data + +__void ops_dat_release_raw_data_memspace(ops_dat dat, int part, ops_access acc, ops_memspace *memspace)__ + +Indicates to OPS that a dataset previously accessed with ops_dat_get_raw_pointer is released by the user, and also tells OPS how it was accessed, and which memory space was used. + +A single call to ops_dat_release_raw_data() releases all pointers obtained by previous calls to ops_dat_get_raw_pointer() calls on the same dat and with the same *memspace argument, i.e. calls do not nest. 
+ +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|acc| the kind of access that was used by the user (OPS_READ if it was read only, OPS_WRITE if it was overwritten, OPS_RW if it was read and written)| +|memspace| set to OPS_HOST or OPS_DEVICE | + +#### ops_dat_fetch_data + +__void ops_dat_fetch_data(ops_dat dat, int part, int *data)__ +This routine copies the data held by OPS to the user-specified memory location, which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0) | +|data| pointer to memory which should be filled by OPS| + +#### ops_dat_set_data + +__void ops_dat_set_data(ops_dat dat, int part, int *data)__ + +This routine copies the data given by the user to the internal data structure used by OPS. User data needs to be laid out in column-major order and strided as indicated by the sizes parameter of ops_dat_get_extents. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|data| pointer to memory which should be copied to OPS | From db0a4643d58ccc674751a9b6edd69049ed6b0f1e Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sat, 9 Oct 2021 21:54:02 +0100 Subject: [PATCH 091/324] Fix a few typos/latex tags in API --- doc/opsapi.md | 52 +++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 058754879c..3b81698c27 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -116,15 +116,15 @@ This routine defines a dataset. 
| Arguments | Description | | ----------- | ----------- | -block | structured block | -dim | dimension of dataset (number of items per grid element) | -size | size in each dimension of the block | -base | base indices in each dimension of the block | -d_m | padding from the face in the negative direction for each dimension (used for block halo) | -d_p | padding from the face in the positive direction for each dimension (used for block halo) | -data | input data of type *T* | -type | the name of type used for output diagnostics (e.g. ``double``,``float``)| -name | a name used for output diagnostics| +|block | structured block | +|dim | dimension of dataset (number of items per grid element) | +|size | size in each dimension of the block | +|base | base indices in each dimension of the block | +|d_m | padding from the face in the negative direction for each dimension (used for block halo) | +|d_p | padding from the face in the positive direction for each dimension (used for block halo) | +|data | input data of type *T* | +|type | the name of type used for output diagnostics (e.g. ``double``,``float``)| +|name | a name used for output diagnostics| The `size` allows to declare different sized data arrays on a given `block`. `d_m` and `d_p` are depth of the "block halos" that are used to @@ -158,7 +158,7 @@ This routine defines a global constant: a variable in global scope. Global const | ----------- | ----------- | |name | a name used to identify the constant | |dim | dimension of dataset (number of items per element) | -|type | the name of type used for output diagnostics (e.g. ``double'',``float'') | +|type | the name of type used for output diagnostics (e.g. ``double``, ``float``) | |data | pointer to input data of type *T* | #### ops_decl_halo @@ -205,7 +205,7 @@ This routine defines a collection of halos. 
Semantically, when an exchange is tr |nhalos| number of halos in *halos* | |halos| array of halos| -#### ops_decl_reduction_handle} +#### ops_decl_reduction_handle __ops_reduction ops_decl_reduction_handle(int size, char *type, char *name)__ This routine defines a reduction handle to be used in a parallel loop @@ -213,7 +213,7 @@ This routine defines a reduction handle to be used in a parallel loop | Arguments | Description | | ----------- | ----------- | |size| size of data in bytes | -|type| the name of type used for output diagnostics (e.g. ``double'',``float'') | +|type| the name of type used for output diagnostics (e.g. ``double``,``float``) | |name| name of the dat used for output diagnostics| __{void ops_reduction_result(ops_reduction handle, T *result) @@ -231,7 +231,7 @@ and ops_halo ops_decl_dat statements have been declared | Arguments | Description | | ----------- | ----------- | -|method| string describing the partitioning method. Currently this string is not used internally, but is simply a place-holder to indicate different partitioning methods in the future. | +|method| string describing the partitioning method. Currently this string is not used internally, but is simply a place-holder to indicate different partitioning methods in the future. | ### Diagnostic and output routines @@ -306,7 +306,7 @@ a simple text editor |dat| ops_dat to to be written| |file| text file to write to| -#### ops_timing_output} +#### ops_timing_output __void ops_timing_output(FILE *os)__ @@ -316,11 +316,11 @@ Print OPS performance performance details to output stream | ----------- | ----------- | |os| output stream, use stdout to print to standard out| -#### ops_NaNcheck} +#### ops_NaNcheck __void ops_NaNcheck(ops_dat dat)__ -Check if any of the values held in the \texttt{dat} is a NaN. If a NaN +Check if any of the values held in the *dat* is a NaN. If a NaN is found, prints an error message and exits. 
| Arguments | Description | @@ -346,7 +346,7 @@ A parallel loop with N arguments has the following syntax: #### ops_par_loop -__void ops_par_loop(\ void (*kernel)(...),char *name, ops_block block, int dims, int *range, ops_arg arg1,ops_arg arg2, ..., ops_arg argN )__ +__void ops_par_loop(void (*kernel)(...),char *name, ops_block block, int dims, int *range, ops_arg arg1,ops_arg arg2, ..., ops_arg argN )__ | Arguments | Description | | ----------- | ----------- | @@ -357,7 +357,7 @@ __void ops_par_loop(\ void (*kernel)(...),char *name, ops_block block, int dims, |range| iteration range array| |args| arguments| -The {\bf ops_arg} arguments in {\bf ops_par_loop} are provided by one of the +The **ops_arg** arguments in **ops_par_loop** are provided by one of the following routines, one for global constants and reductions, and the other for OPS datasets. @@ -454,7 +454,7 @@ accessed, for stencil point*p*, in dimension *m* are defined as stride[m]*loop_index[m] + stencil[p*dims+m] ``` - where ``loop_index[m]`` is the iteration index (within the +where ``loop_index[m]`` is the iteration index (within the user-defined iteration space) in the different dimensions. If, for one or more dimensions, both ``stride[m]`` and @@ -485,7 +485,7 @@ after a crash, the same number of MPI processes have to be used. To enable check __bool ops_checkpointing_init(const char *filename, double interval, int options)__ -Initialises the checkpointing system, has to be called after {\tt ops_partition}. Returns true if the application launches in restore +Initialises the checkpointing system, has to be called after *ops_partition*. Returns true if the application launches in restore mode, false otherwise. 
* OPS_CHECKPOINT_INITPHASE - indicates that there are a number of parallel loops at the very beginning of the simulations which should be excluded from any checkpoint; mainly because they initialise datasets that do not change during the main body of the execution. During restore mode these loops are executed as usual. An example would be the computation of the mesh geometry, which can be excluded from the checkpoint if it is re-computed when recovering and restoring a checkpoint. The API call *void ops_checkpointing_initphase_done()* indicates the end of this initial phase. -* OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually controls the location of the checkpoint, and explicitly specifies the list of \texttt{ops_dat}s to be saved. +* OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually controls the location of the checkpoint, and explicitly specifies the list of *ops_dat*s to be saved. * OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the location of the checkpoint, and it also enables fast-forwarding, by skipping the execution of the application (even though none of the parallel loops would actually execute, there may be significant work outside of those) up to the checkpoint @@ -561,7 +561,7 @@ it combines the manual datlist and fastfw calls, and triggers the creation of a |nbytes| size of the payload in bytes| |payload| pointer to memory into which the payload is packed| -\noindent The suggested use of these \textbf{manual} functions is of course when the optimal location for checkpointing +The suggested use of these **manual** functions is of course when the optimal location for checkpointing is known - one of the ways to determine that is to use the built-in algorithm. More details of this will be reported in a tech-report on checkpointing, to be published later. 
@@ -581,8 +581,12 @@ This routine returns the number of chunks of the given dataset held by the curre #### ops_dat_get_global_npartitions} -__int ops_dat_get_global_npartitions(ops_dat dat)} -{This routine returns the number of chunks of the given dataset held by all processes.} +__int ops_dat_get_global_npartitions(ops_dat dat)__ + +This routine returns the number of chunks of the given dataset held by all processes. + +| Arguments | Description | +| ----------- | ----------- | |dat| the dataset #### ops_dat_get_extents From ebc6f95f5f9b661e39007ed1b906a0b4dfb89425 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sat, 9 Oct 2021 23:11:34 +0100 Subject: [PATCH 092/324] Add user kernel example to deveanapp --- doc/devanapp.md | 76 +++++ doc/user.md | 839 ------------------------------------------------ 2 files changed, 76 insertions(+), 839 deletions(-) delete mode 100644 doc/user.md diff --git a/doc/devanapp.md b/doc/devanapp.md index 3004738caf..26a8f14339 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,5 +1,81 @@ # Developing an OPS Application + ## Tutorial +## +## OPS User Kernels + +In OPS, the elemental operation carried out per mesh/grid point is +specified as an outlined function called a *user kernel*. An example +taken from the Cloverleaf application is given below. 
+ +```c++ +void accelerate_kernel( const ACC &density0, const ACC &volume, + ACC &stepbymass, const ACC &xvel0, ACC &xvel1, + const ACC &xarea, const ACC &pressure, + const ACC &yvel0, ACC &yvel1, + const ACC &yarea, const ACC &viscosity) { + + double nodal_mass; + + //{0,0, -1,0, 0,-1, -1,-1}; + nodal_mass = ( density0(-1,-1) * volume(-1,-1) + + density0(0,-1) * volume(0,-1) + + density0(0,0) * volume(0,0) + + density0(-1,0) * volume(-1,0) ) * 0.25; + + stepbymass(0,0) = 0.5*dt/ nodal_mass; + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, 0,-1}; + + xvel1(0,0) = xvel0(0,0) - stepbymass(0,0) * + ( xarea(0,0) * ( pressure(0,0) - pressure(-1,0) ) + + xarea(0,-1) * ( pressure(0,-1) - pressure(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, -1,0}; + + yvel1(0,0) = yvel0(0,0) - stepbymass(0,0) * + ( yarea(0,0) * ( pressure(0,0) - pressure(0,-1) ) + + yarea(-1,0) * ( pressure(-1,0) - pressure(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, 0,-1}; + + xvel1(0,0) = xvel1(0,0) - stepbymass(0,0) * + ( xarea(0,0) * ( viscosity(0,0) - viscosity(-1,0) ) + + xarea(0,-1) * ( viscosity(0,-1) - viscosity(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, -1,0}; + + yvel1(0,0) = yvel1(0,0) - stepbymass(0,0) * + ( yarea(0,0) * ( viscosity(0,0) - viscosity(0,-1) ) + + yarea(-1,0) * ( viscosity(-1,0) - viscosity(-1,-1) ) ); + +} +``` + +This user kernel is then used in an `ops_par_loop` function. The key aspect to note in the user kernel is the use of the ACC\<\> objects and their +parentheses operator. These specify the stencil in accessing the +elements of the respective data arrays. 
+ +```c++ +int rangexy_inner_plus1[] = {x_min,x_max+1,y_min,y_max+1}; + +ops_par_loop(accelerate_kernel, "accelerate_kernel", clover_grid, 2, rangexy_inner_plus1, + ops_arg_dat(density0, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(volume, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), + ops_arg_dat(xvel0, 1, S2D_00, "double", OPS_READ), + ops_arg_dat(xvel1, 1, S2D_00, "double", OPS_INC), + ops_arg_dat(xarea, 1, S2D_00_0M1, "double", OPS_READ), + ops_arg_dat(pressure, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(yvel0, 1, S2D_00, "double", OPS_READ), + ops_arg_dat(yvel1, 1, S2D_00, "double", OPS_INC), + ops_arg_dat(yarea, 1, S2D_00_M10, "double", OPS_READ), + ops_arg_dat(viscosity, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ)); +``` ## Supported Paralleizations ## Code-generation Flags ## File I/O diff --git a/doc/user.md b/doc/user.md deleted file mode 100644 index d3ebca8478..0000000000 --- a/doc/user.md +++ /dev/null @@ -1,839 +0,0 @@ ---- -author: -- Mike Giles, Istvan Reguly, Gihan Mudalige -date: May 2019 -title: OPS C++ User's Manual ---- - - - - - -# OPS C++ API - -## Initialisation declaration and termination routines - -### {#section .unnumbered} - -::: list -plus 1pt minus 1pt - -the usual command line arguments - -an integer which defines the level of debugging diagnostics and -reporting to be performed -::: - -Currently, higher `diags_level`s does the following checks\ -`diags_level` $=$ 1 : no diagnostics, default to achieve best runtime -performance.\ -`diags_level` $>$ 1 : print block decomposition and `ops_par_loop` -timing breakdown.\ -`diags_level` $>$ 4 : print intra-block halo buffer allocation feedback -(for OPS internal development only)\ -`diags_level` $>$ 5 : check if intra-block halo MPI sends depth match -MPI receives depth (for OPS internal development only)\ - -### {#section-1 .unnumbered} - -::: list -plus 1pt minus 1pt - -dimension of the block - -a 
name used for output diagnostics -::: - -### {#section-2 .unnumbered} - -::: list -plus 1pt minus 1pt - -dimension of the block - -a name used for output diagnostics - -hdf5 file to read and obtain the block information from -::: - -Although this routine does not read in any extra information about the -block from the named HDF5 file than what is already specified in the -arguments, it is included here for error checking (e.g. check if blocks -defined in an HDF5 file is matching with the declared arguments in an -application) and completeness.\ - -### {#section-3 .unnumbered} - -::: list -plus 1pt minus 1pt - -structured block - -dimension of dataset (number of items per grid element) - -size in each dimension of the block - -base indices in each dimension of the block - -padding from the face in the negative direction for each dimension (used -for block halo) - -padding from the face in the positive direction for each dimension (used -for block halo) - -input data of type `T` - -the name of type used for output diagnostics (e.g. "double", "float") - -a name used for output diagnostics -::: - -The `size` allows to declare different sized data arrays on a given -`block`. `d_m` and `d_p` are depth of the "block halos" that are used to -indicate the offset from the edge of a block (in both the negative and -positive directions of each dimension).\ -\ - -### {#section-4 .unnumbered} - -::: list -plus 1pt minus 1pt - -structured block - -dimension of dataset (number of items per grid element) - -the name of type used for output diagnostics (e.g. "double", "float") - -name of the dat used for output diagnostics - -hdf5 file to read and obtain the data from -::: - -### {#section-5 .unnumbered} - -::: list -plus 1pt minus 1pt - -a name used to identify the constant - -dimension of dataset (number of items per element) - -the name of type used for output diagnostics (e.g. 
"double", "float") - -pointer to input data of type `T` -::: - -### {#section-6 .unnumbered} - -::: list -plus 1pt minus 1pt - -a name used to identify the constant - -dimension of dataset (number of items per element) - -the name of type used for output diagnostics (e.g. "double", "float") - -pointer to new values for constant of type `T` -::: - -### {#section-7 .unnumbered} - -::: list -plus 1pt minus 1pt - -origin dataset - -destination dataset - -defines an iteration size (number of indices to iterate over in each -direction) - -indices of starting point in \"from\" dataset - -indices of starting point in \"to\" dataset - -direction of incrementing for \"from\" for each dimension of `iter_size` - -direction of incrementing for \"to\" for each dimension of `iter_size` -::: - -A from_dir \[1,2\] and a to_dir \[2,1\] means that x in the first block -goes to y in the second block, and y in first block goes to x in second -block. A negative sign indicates that the axis is flipped. (Simple -example: a transfer from (1:2,0:99,0:99) to (-1:0,0:99,0:99) would use -iter_size = \[2,100,100\], from_base = \[1,0,0\], to_base = \[-1,0,0\], -from_dir = \[0,1,2\], to_dir = \[0,1,2\]. In more complex case this -allows for transfers between blocks with different orientations.)\ - -### {#section-8 .unnumbered} - -::: list -plus 1pt minus 1pt - -origin dataset - -destination dataset - -hdf5 file to read and obtain the data from -::: - -### {#section-9 .unnumbered} - -::: list -plus 1pt minus 1pt - -number of halos in `halos` - -array of halos -::: - -### {#section-10 .unnumbered} - -::: list -plus 1pt minus 1pt - -size of data in bytes - -the name of type used for output diagnostics (e.g. 
"double", "float") - -name of the dat used for output diagnostics -::: - -::: list -plus 1pt minus 1pt - -the `ops_reduction` handle - -a pointer to write the results to, memory size has to match the declared -::: - -### {#section-11 .unnumbered} - -::: list -plus 1pt minus 1pt - -string describing the partitioning method. Currently this string is not -used internally, but is simply a place-holder to indicate different -partitioning methods in the future. -::: - -### {#section-12 .unnumbered} - -::: list -plus 1pt minus 1pt -::: - -## Diagnostics and output routines - -### {#section-13 .unnumbered} - -::: list -plus 1pt minus 1pt -::: - -### {#section-14 .unnumbered} - -::: list -plus 1pt minus 1pt -::: - -### {#section-15 .unnumbered} - -::: list -plus 1pt minus 1pt - -variable to hold the CPU time at the time of invocation - -variable to hold the elapsed time at the time of invocation -::: - -### {#section-16 .unnumbered} - -::: list -plus 1pt minus 1pt - -ops_block to be written - -hdf5 file to write to -::: - -### {#section-17 .unnumbered} - -::: list -plus 1pt minus 1pt - -ops_stencil to be written - -hdf5 file to write to -::: - -### {#section-18 .unnumbered} - -::: list -plus 1pt minus 1pt - -ops_dat to be written - -hdf5 file to write to -::: - -### {#section-19 .unnumbered} - -::: list -plus 1pt minus 1pt - -ops_dat to to be written - -text file to write to -::: - -### {#section-20 .unnumbered} - -::: list -plus 1pt minus 1pt - -output stream, use stdout to print to standard out -::: - -### {#section-21 .unnumbered} - -::: list -plus 1pt minus 1pt - -ops_dat to to be checked -::: - -## Halo exchange - -### {#section-22 .unnumbered} - -::: list -plus 1pt minus 1pt - -the halo group -::: - -## Parallel loop syntax - -A parallel loop with N arguments has the following syntax: - -### {#section-23 .unnumbered} - -::: list -plus 1pt minus 1pt - -user's kernel function with N arguments - -name of kernel function, used for output diagnostics - -the ops_block over 
which this loop executes - -dimension of loop iteration - -iteration range array - -arguments -::: - -The **ops_arg** arguments in **ops_par_loop** are provided by one of the -following routines, one for global constants and reductions, and the -other for OPS datasets. - -### {#section-24 .unnumbered} - -::: list -plus 1pt minus 1pt - -data array - -array dimension - -string representing the type of data held in data - -access type -::: - -### {#section-25 .unnumbered} - -::: list -plus 1pt minus 1pt - -an `ops_reduction` handle - -array dimension (according to `type`) - -string representing the type of data held in data - -access type -::: - -### {#section-26 .unnumbered} - -::: list -plus 1pt minus 1pt - -dataset - -stencil for accessing data - -string representing the type of data held in dataset - -access type -::: - -### {#section-27 .unnumbered} - -::: list -plus 1pt minus 1pt -::: - -## Stencils - -The final ingredient is the stencil specification, for which we have two -versions: simple and strided.\ - -### {#section-28 .unnumbered} - -::: list -plus 1pt minus 1pt - -dimension of loop iteration - -number of points in the stencil - -stencil for accessing data - -string representing the name of the stencil -::: - -### {#section-29 .unnumbered} - -::: list -plus 1pt minus 1pt - -dimension of loop iteration - -number of points in the stencil - -stencil for accessing data - -stride for accessing data - -string representing the name of the stencil\ -::: - -### {#section-30 .unnumbered} - -::: list -plus 1pt minus 1pt - -dimension of loop iteration - -number of points in the stencil - -string representing the name of the stencil - -hdf5 file to write to -::: - -In the strided case, the semantics for the index of data to be accessed, -for stencil point `p`, in dimension `m` are defined as:\ -,\ -where `loop_index[m]` is the iteration index (within the user-defined -iteration space) in the different dimensions. 
- -If, for one or more dimensions, both `stride[m]` and `stencil[p*dims+m]` -are zero, then one of the following must be true; - -- the dataset being referenced has size 1 for these dimensions - -- these dimensions are to be omitted and so the dataset has dimension - equal to the number of remaining dimensions. - -See `OPS/apps/c/CloverLeaf/build_field.cpp` and -`OPS/apps/c/CloverLeaf/generate.cpp` for an example -`ops_decl_strided_stencil` declaration and its use in a loop, -respectively.\ -These two stencil definitions probably take care of all of the cases in -the Introduction except for multiblock applications with interfaces with -different orientations -- this will need a third, even more general, -stencil specification. The strided stencil will handle both multigrid -(with a stride of 2 for example) and the boundary condition and reduced -dimension applications (with a stride of 0 for the relevant dimensions). - -## Checkpointing - -OPS supports the automatic checkpointing of applications. Using the API -below, the user specifies the file name for the checkpoint and an -average time interval between checkpoints, OPS will then automatically -save all necessary information periodically that is required to -fast-forward to the last checkpoint if a crash occurred. Currently, when -re-launching after a crash, the same number of MPI processes have to be -used. To enable checkpointing mode, the `OPS_CHECKPOINT` runtime -argument has to be used.\ - -### {#section-31 .unnumbered} - -::: list -plus 1pt minus 1pt - -name of the file for checkpointing. In MPI, this will automatically be -post-fixed with the rank ID. 
- -average time (seconds) between checkpoints - -a combinations of flags, listed in `ops_checkpointing.h`:\ -OPS_CHECKPOINT_INITPHASE - indicates that there are a number of parallel -loops at the very beginning of the simulations which should be excluded -from any checkpoint; mainly because they initialise datasets that do not -change during the main body of the execution. During restore mode these -loops are executed as usual. An example would be the computation of the -mesh geometry, which can be excluded from the checkpoint if it is -re-computed when recovering and restoring a checkpoint. The API call -void `ops_checkpointing_initphase_done()` indicates the end of this -initial phase. - -OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually -controls the location of the checkpoint, and explicitly specifies the -list of `ops_dat`s to be saved. - -OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the -location of the checkpoint, and it also enables fast-forwarding, by -skipping the execution of the application (even though none of the -parallel loops would actually execute, there may be significant work -outside of those) up to the checkpoint. - -OPS_CHECKPOINT_MANUAL - Indicates that when the corresponding API -function is called, the checkpoint should be created. Assumes the -presence of the above two options as well. 
-::: - -### {#section-32 .unnumbered} - -::: list -plus 1pt minus 1pt - -number of datasets to be saved - -arrays of `ops_dat` handles to be saved -::: - -### {#section-33 .unnumbered} - -::: list -plus 1pt minus 1pt - -size of the payload in bytes - -pointer to memory into which the payload is packed -::: - -### {#section-34 .unnumbered} - -::: list -plus 1pt minus 1pt - -number of datasets to be saved - -arrays of `ops_dat` handles to be saved - -size of the payload in bytes - -pointer to memory into which the payload is packed -::: - -### {#section-35 .unnumbered} - -::: list -plus 1pt minus 1pt - -number of datasets to be saved - -arrays of `ops_dat` handles to be saved - -size of the payload in bytes - -pointer to memory into which the payload is packed -::: - -The suggested use of these **manual** functions is of course when the -optimal location for checkpointing is known - one of the ways to -determine that is to use the built-in algorithm. More details of this -will be reported in a tech-report on checkpointing, to be published -later. - -## Access to OPS data - -This section describes APIS that give the user access to internal data -structures in OPS and return data to user-space. 
These should be used -cautiously and sparsely, as they can affect performance significantly - -### {#section-36 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset -::: - -### {#section-37 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset -::: - -### {#section-38 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -an array populated with the displacement of the chunk within the -"global" distributed array - -an array populated with the spatial extents -::: - -### {#section-39 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -an array populated with the displacement of the chunk within the -"global" distributed array - -an array populated with the spatial extents - -an array populated strides in spatial dimensions needed for column-major -indexing - -an array populated with padding on the left in each dimension. Note that -these are negative values - -an array populated with padding on the right in each dimension -::: - -### {#section-40 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -a stencil used to determine required MPI halo exchange depths - -when set to OPS_HOST or OPS_DEVICE, returns a pointer to data in that -memory space, otherwise must be set to 0, and returns whether data is in -the host or on the device -::: - -### {#section-41 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -the kind of access that was used by the user (OPS_READ if it was read -only, OPS_WRITE if it was overwritten, OPS_RW if it was read and -written) -::: - -### {#section-42 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -the kind of access that was used by the user (OPS_READ if it was read -only, OPS_WRITE if it was overwritten, OPS_RW if it was read and -written) - -set to OPS_HOST or OPS_DEVICE -::: - -### {#section-43 .unnumbered} - 
-::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -pointer to memory which should be filled by OPS -::: - -### {#section-44 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -pointer to memory which should be copied to OPS -::: - -# Tiling for Cache-blocking - -OPS has a code generation (ops_gen_mpi_lazy) and build target for -tiling. Once compiled, to enable, use the `OPS_TILING` runtime parameter -- this will look at the L3 cache size of your CPU and guess the correct -tile size. If you want to alter the amount of cache to be used for the -guess, use the `OPS_CACHE_SIZE=XX` runtime parameter, where the value is -in Megabytes. To manually specify the tile sizes, use the -OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments. - -When MPI is combined with OpenMP tiling can be extended to the MPI -halos. Set `OPS_TILING_MAXDEPTH` to increase the the halo depths so that -halos for multiple `ops_par_loops` can be exchanged with a single MPI -message (see [@TPDS2017] for more details)\ -To test, compile CloverLeaf under `apps/c/CloverLeaf`, modify clover.in -to use a $6144^2$ mesh, then run as follows:\ -For OpenMP with tiling:\ -`export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING`\ -For MPI+OpenMP with tiling:\ -`export OMP_NUM_THREADS=xx; mpirun -np xx ./cloverleaf_mpi_tiled OPS_TILING OPS_TILING_MAXDEPTH=6`\ -To manually specify the tile sizes (in number of grid points), use the -OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments:\ -`export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING OPS_TILESIZE_X=600 OPS_TILESIZE_Y=200 ` - -# CUDA and OpenCL Runtime Arguments - -The CUDA (and OpenCL) thread block sizes can be controlled by setting -the `OPS_BLOCK_SIZE_X, OPS_BLOCK_SIZE_Y` and `OPS_BLOCK_SIZE_Z` runtime -arguments. 
For example :\ -`./cloverleaf_cuda OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4`\ -`OPS_CL_DEVICE=XX` runtime flag sets the OpenCL device to execute the -code on.\ -Usually `OPS_CL_DEVICE=0` selects the CPU and `OPS_CL_DEVICE=1` selects -GPUs. - -# Executing with GPUDirect - -GPU direct support for MPI+CUDA, to enable (on the OPS side) add -**-gpudirect** when running the executable. You may also have to use -certain environmental flags when using different MPI distributions. For -an example of the required flags and environmental settings on the -Cambridge Wilkes2 GPU cluster see:\ - - -# OPS User Kernels - -In OPS, the elemental operation carried out per mesh/grid point is -specified as an outlined function called a *user kernel*. An example -taken from the Cloverleaf application is given in Figure -[\[fig:example\]](#fig:example){reference-type="ref" -reference="fig:example"}.\ - -``` {.cpp mathescape="" linenos="" startFrom="1" numbersep="0pt" gobble="2" frame="lines" framesep="1mm"} -void accelerate_kernel( const ACC &density0, const ACC &volume, - ACC &stepbymass, const ACC &xvel0, ACC &xvel1, - const ACC &xarea, const ACC &pressure, - const ACC &yvel0, ACC &yvel1, - const ACC &yarea, const ACC &viscosity) { - - double nodal_mass; - - //{0,0, -1,0, 0,-1, -1,-1}; - nodal_mass = ( density0(-1,-1) * volume(-1,-1) - + density0(0,-1) * volume(0,-1) - + density0(0,0) * volume(0,0) - + density0(-1,0) * volume(-1,0) ) * 0.25; - - stepbymass(0,0) = 0.5*dt/ nodal_mass; - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, 0,-1}; - - xvel1(0,0) = xvel0(0,0) - stepbymass(0,0) * - ( xarea(0,0) * ( pressure(0,0) - pressure(-1,0) ) + - xarea(0,-1) * ( pressure(0,-1) - pressure(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, -1,0}; - - yvel1(0,0) = yvel0(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( pressure(0,0) - pressure(0,-1) ) + - yarea(-1,0) * ( pressure(-1,0) - pressure(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, 0,-1}; - - xvel1(0,0) = xvel1(0,0) - stepbymass(0,0) * - ( 
xarea(0,0) * ( viscosity(0,0) - viscosity(-1,0) ) + - xarea(0,-1) * ( viscosity(0,-1) - viscosity(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, -1,0}; - - yvel1(0,0) = yvel1(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( viscosity(0,0) - viscosity(0,-1) ) + - yarea(-1,0) * ( viscosity(-1,0) - viscosity(-1,-1) ) ); - - -} -``` - -[\[fig:example\]]{#fig:example label="fig:example"} - -\ -\ -\ -\ -This user kernel is then used in an `ops_par_loop` (Figure -[\[fig:parloop\]](#fig:parloop){reference-type="ref" -reference="fig:parloop"}). The key aspect to note in the user kernel in -Figure [\[fig:example\]](#fig:example){reference-type="ref" -reference="fig:example"} is the use of the ACC\<\> objects and their -parentheses operator. These specify the stencil in accessing the -elements of the respective data arrays. - -``` {.cpp mathescape="" linenos="" startFrom="1" numbersep="0pt" gobble="2" frame="lines" framesep="2mm"} - int rangexy_inner_plus1[] = {x_min,x_max+1,y_min,y_max+1}; - - ops_par_loop(accelerate_kernel, "accelerate_kernel", clover_grid, 2, rangexy_inner_plus1, - ops_arg_dat(density0, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(volume, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(xvel0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(xarea, 1, S2D_00_0M1, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(yarea, 1, S2D_00_M10, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ)); -``` - -[\[fig:parloop\]]{#fig:parloop label="fig:parloop"} - -::: thebibliography -1 OP2 for Many-Core Platforms, 2013. - - -Istvan Z. Reguly, G.R. Mudalige, Mike B. Giles. Loop Tiling in -Large-Scale Stencil Codes at Run-time with OPS. 
(2017) IEEE Transactions -on Parallel and Distributed Systems. - -::: From 1d34162b8198b690008fb87b28cd80fe93516ce3 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sat, 9 Oct 2021 23:55:19 +0100 Subject: [PATCH 093/324] Gitflow work flow model --- doc/devdoc.md | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index 5e906c5729..e959787cdd 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -1,11 +1,44 @@ -# Developer Guide +# Developer Guide ## Code-generator ### Frontend API parser ### Target Parallel Templates ### Elemental Kernel Transformations ## Back-end Library ### Sequential and multi-threaded CPU -### MPI and Partitioning -### HDF5 +### MPI and Partitioning +### HDF5 ### CUDA ### Cache blocking tiling and comm-avoiding optimizations +## Git work flow for contribution +To facilitate the concept of "Version" and "Release", we adopt the [Gitflow Workflow model](#https://nvie.com/posts/a-successful-git-branching-model/). +### Overall work flow + +1. Create develop branch from main + +2. Create release branch from develop + + After creating a release branch, only documentation and bug fixes will be added this branch. + +3. Create feature branches from develop + +4. Merge a feature branch into the develop branch once completed + +5. Merge release branch into develop and main once completed + +6. Create a hotfix branch from main if an issue is identified + +7. Merge a hotfix branch to both develop and main once fixed + +See also https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow. + +### A few issues +Using the Gitflow model tends to produce a few long-live branches (e.g., feature), which may increase the risk of "conflicts" for intergration. 
To migrate this, we encourage the following practice + +* Try to create short-lived branches with a few small commites when possbile (e.g., a hotfix branch) +* Once a branch properly merges or a feature finalised, delete the branch +* A feature branch tends to be long-live, try to split a feature into "milestones" and merge into the develop branch when finishing each milestone. + +**The Gitflow tool will automatically delete a branch once it is finished.** +### Gitflow tool + +see https://github.com/nvie/gitflow \ No newline at end of file From 7d48a7092d8768aaa961dd9e611a0420c0f24ecf Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 22:17:47 +0100 Subject: [PATCH 094/324] Try to build doxygen --- doc/conf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index c53bf5a553..3815b16821 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -13,7 +13,9 @@ # import os # import sys # sys.path.insert(0, os.path.abspath('.')) - +import subprocess +subprocess.call('doxygen doc/ops/Doxyfile', shell=True) +html_extra_path = ['doc/ops/html'] # -- Project information ----------------------------------------------------- From 2c613cbff0be266f6df6431ff4d1eb793c58a40b Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 22:24:18 +0100 Subject: [PATCH 095/324] Tune Doxygen Dir --- doc/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 3815b16821..a4206b8354 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -14,8 +14,8 @@ # import sys # sys.path.insert(0, os.path.abspath('.')) import subprocess -subprocess.call('doxygen doc/ops/Doxyfile', shell=True) -html_extra_path = ['doc/ops/html'] +subprocess.call('doxygen ops/Doxyfile', shell=True) +html_extra_path = ['ops/html'] # -- Project information ----------------------------------------------------- From d86ce7da93f76b82810d363065cd167a32970cfd Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 
2021 22:47:43 +0100 Subject: [PATCH 096/324] Try Copy Doxygen --- doc/conf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index a4206b8354..97b7b120d5 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -15,7 +15,8 @@ # sys.path.insert(0, os.path.abspath('.')) import subprocess subprocess.call('doxygen ops/Doxyfile', shell=True) -html_extra_path = ['ops/html'] +subprocess.call('cp ops/html/ . -r', shell=True) +#html_extra_path = ['ops/html'] # -- Project information ----------------------------------------------------- From d77c75e0969fc9e3d53a2c3a556fc57d1cb4a039 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 22:55:43 +0100 Subject: [PATCH 097/324] Not copy --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 97b7b120d5..12886a4891 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -15,7 +15,7 @@ # sys.path.insert(0, os.path.abspath('.')) import subprocess subprocess.call('doxygen ops/Doxyfile', shell=True) -subprocess.call('cp ops/html/ . -r', shell=True) +#subprocess.call('cp ops/html/ . -r', shell=True) #html_extra_path = ['ops/html'] # -- Project information ----------------------------------------------------- From a137cac5eaa440fccee3e6bf4549bf4878935907 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 23:22:58 +0100 Subject: [PATCH 098/324] Stoo sphinx --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 12886a4891..ac5f42dbae 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -14,7 +14,7 @@ # import sys # sys.path.insert(0, os.path.abspath('.')) import subprocess -subprocess.call('doxygen ops/Doxyfile', shell=True) +#subprocess.call('doxygen ops/Doxyfile', shell=True) #subprocess.call('cp ops/html/ . 
-r', shell=True) #html_extra_path = ['ops/html'] From 5b4dae0aa81399ca637e45c899853f50d902af18 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 23:33:40 +0100 Subject: [PATCH 099/324] Try Gitlab for doxygen --- .gitlab-ci.yml | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 967e4c0461..7a2b67b1cb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -228,19 +228,32 @@ test:Intel: only: - master -#Stage "docs" -make-docs: - stage: docs - when: always +# #Stage "docs" +# make-docs: +# stage: docs +# when: always +# tags: +# - CCP, test +# script: +# - cd doc && make all clean +# artifacts: +# expire_in: 4 week +# paths: +# - doc/user.pdf +# - doc/ops/html +# - doc/ops/latex/refman.pdf +# - doc/ops_translator/html +# - doc/ops_translator/latex/refman.pdf + +pages: tags: - CCP, test script: - - cd doc && make all clean + - cd doc + - doxygen ops/Doxyfile + - mv ops/html/ public/ artifacts: - expire_in: 4 week paths: - - doc/user.pdf - - doc/ops/html - - doc/ops/latex/refman.pdf - - doc/ops_translator/html - - doc/ops_translator/latex/refman.pdf \ No newline at end of file + - public + rules: + - if: $CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH From be2098b0d4de8311829a2896d7c4c5abe7f3aa2b Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 23:49:36 +0100 Subject: [PATCH 100/324] Add stage --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7a2b67b1cb..e52a9b9d69 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -246,6 +246,7 @@ test:Intel: # - doc/ops_translator/latex/refman.pdf pages: + stage: docs tags: - CCP, test script: From e9dc6880285942f386730ad216bf174584b74581 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 23:53:47 +0100 Subject: [PATCH 101/324] Adjust when to generate doxygen --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/.gitlab-ci.yml b/.gitlab-ci.yml index e52a9b9d69..0d214859d9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -247,6 +247,7 @@ test:Intel: pages: stage: docs + when: always tags: - CCP, test script: @@ -256,5 +257,4 @@ pages: artifacts: paths: - public - rules: - - if: $CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH + From 1399045662939f84833a2c44314d0f56cccb8e81 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Mon, 11 Oct 2021 00:18:04 +0100 Subject: [PATCH 102/324] Correct public dir --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0d214859d9..b6b24b90c7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -256,5 +256,5 @@ pages: - mv ops/html/ public/ artifacts: paths: - - public + - doc/public From 5778979eab675e78ab9b4e6f8016780f25266646 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Mon, 11 Oct 2021 09:12:39 +0100 Subject: [PATCH 103/324] Correct Dir --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b6b24b90c7..63e5627fd0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -253,8 +253,8 @@ pages: script: - cd doc - doxygen ops/Doxyfile - - mv ops/html/ public/ + - mv ops/html/ $CI_PROJECT_DIR/public/ artifacts: paths: - - doc/public + - public From e8f0b79b191df893de07086ce103eff5689739cb Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Mon, 11 Oct 2021 09:46:15 +0100 Subject: [PATCH 104/324] Add doxygen comment link --- doc/opsapi.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/opsapi.md b/doc/opsapi.md index 3b81698c27..1b40c04f7a 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -682,3 +682,6 @@ This routine copies the data given by the user to the internal data structure u |dat| the dataset| |part| the chunk index (has to be 0)| |data| pointer to memory which should be copied to OPS | + +### Doxygen comments for APIs +We also provide Doxygen comments in for using APIs, please view 
[here](#https://op-dsl-ci.gitlab.io/ops-ci/). \ No newline at end of file From 39800991b9955f16ee7879828d0f2ac504b7c4c4 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Mon, 11 Oct 2021 09:48:53 +0100 Subject: [PATCH 105/324] Repair the Doxgen link --- doc/opsapi.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 1b40c04f7a..71c4e11058 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -684,4 +684,4 @@ This routine copies the data given by the user to the internal data structure u |data| pointer to memory which should be copied to OPS | ### Doxygen comments for APIs -We also provide Doxygen comments in for using APIs, please view [here](#https://op-dsl-ci.gitlab.io/ops-ci/). \ No newline at end of file +We also provide Doxygen comments in for using APIs, please view [here](https://op-dsl-ci.gitlab.io/ops-ci/). \ No newline at end of file From 6910e9a99ee875d77c906f3410b1f1eb0ca8951e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 11 Oct 2021 12:02:11 +0100 Subject: [PATCH 106/324] Update opsapi.md --- doc/opsapi.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 71c4e11058..b8c9a908e7 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -683,5 +683,5 @@ This routine copies the data given by the user to the internal data structure u |part| the chunk index (has to be 0)| |data| pointer to memory which should be copied to OPS | -### Doxygen comments for APIs -We also provide Doxygen comments in for using APIs, please view [here](https://op-dsl-ci.gitlab.io/ops-ci/). \ No newline at end of file +## Doxygen +Doxygen generated from OPS source can be found [here](https://op-dsl-ci.gitlab.io/ops-ci/). From bca053a15d2cfb86b66307d3f184ee959b567d69 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Mon, 11 Oct 2021 12:14:28 +0100 Subject: [PATCH 107/324] Update installation.md --- doc/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index ec6acbc0c8..ae1f68b90e 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -144,4 +144,4 @@ make ``` -## Runtime Flags and Options + From fb27a6c67d1ba9df52ed7ce311790f69bea2217e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 11 Oct 2021 12:15:11 +0100 Subject: [PATCH 108/324] Update devanapp.md --- doc/devanapp.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 26a8f14339..cc4e304057 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -76,6 +76,7 @@ ops_par_loop(accelerate_kernel, "accelerate_kernel", clover_grid, 2, rangexy_inn ops_arg_dat(yarea, 1, S2D_00_M10, "double", OPS_READ), ops_arg_dat(viscosity, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ)); ``` +## File I/O ## Supported Paralleizations ## Code-generation Flags -## File I/O +## Runtime Flags and Options From e80d6666ec3048f6846e5038f66e992e81636938 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 11 Oct 2021 12:17:20 +0100 Subject: [PATCH 109/324] Update devanapp.md --- doc/devanapp.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index cc4e304057..d9adc96451 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,7 +1,7 @@ # Developing an OPS Application ## Tutorial -## + ## OPS User Kernels In OPS, the elemental operation carried out per mesh/grid point is From 988d205808df675694400f044dc294996606814a Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 12:10:17 +0100 Subject: [PATCH 110/324] Update devanapp.md --- doc/devanapp.md | 91 +++++++++---------------------------------------- 1 file changed, 16 insertions(+), 75 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index d9adc96451..eb674cea26 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,82 +1,23 @@ # Developing an OPS Application -## Tutorial -## OPS User Kernels +## OPS Abstraction +## Example Application +## Original - Initialisation +## Original - Boundary loops +## Original - Main iteration +## Build OPS +## Step 1 - Preparing to use OPS +## Step 2 - OPS declarations +## Step 3 - First parallel loop +## Step 4 - Indexes and global constants +## Step 5 - Complex stencils and reductions +## Step 6 - Handing it all to OPS +## Step 7 - Code generation +## Code generated versions +## Optimizations - general +## Optimizations - tiling -In OPS, the elemental operation carried out per mesh/grid point is -specified as an outlined function called a *user kernel*. An example -taken from the Cloverleaf application is given below. 
- -```c++ -void accelerate_kernel( const ACC &density0, const ACC &volume, - ACC &stepbymass, const ACC &xvel0, ACC &xvel1, - const ACC &xarea, const ACC &pressure, - const ACC &yvel0, ACC &yvel1, - const ACC &yarea, const ACC &viscosity) { - - double nodal_mass; - - //{0,0, -1,0, 0,-1, -1,-1}; - nodal_mass = ( density0(-1,-1) * volume(-1,-1) - + density0(0,-1) * volume(0,-1) - + density0(0,0) * volume(0,0) - + density0(-1,0) * volume(-1,0) ) * 0.25; - - stepbymass(0,0) = 0.5*dt/ nodal_mass; - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, 0,-1}; - - xvel1(0,0) = xvel0(0,0) - stepbymass(0,0) * - ( xarea(0,0) * ( pressure(0,0) - pressure(-1,0) ) + - xarea(0,-1) * ( pressure(0,-1) - pressure(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, -1,0}; - - yvel1(0,0) = yvel0(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( pressure(0,0) - pressure(0,-1) ) + - yarea(-1,0) * ( pressure(-1,0) - pressure(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, 0,-1}; - - xvel1(0,0) = xvel1(0,0) - stepbymass(0,0) * - ( xarea(0,0) * ( viscosity(0,0) - viscosity(-1,0) ) + - xarea(0,-1) * ( viscosity(0,-1) - viscosity(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, -1,0}; - - yvel1(0,0) = yvel1(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( viscosity(0,0) - viscosity(0,-1) ) + - yarea(-1,0) * ( viscosity(-1,0) - viscosity(-1,-1) ) ); - -} -``` - -This user kernel is then used in an `ops_par_loop` function. The key aspect to note in the user kernel is the use of the ACC\<\> objects and their -parentheses operator. These specify the stencil in accessing the -elements of the respective data arrays. 
- -```c++ -int rangexy_inner_plus1[] = {x_min,x_max+1,y_min,y_max+1}; - -ops_par_loop(accelerate_kernel, "accelerate_kernel", clover_grid, 2, rangexy_inner_plus1, - ops_arg_dat(density0, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(volume, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(xvel0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(xarea, 1, S2D_00_0M1, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(yarea, 1, S2D_00_M10, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ)); -``` -## File I/O ## Supported Paralleizations ## Code-generation Flags ## Runtime Flags and Options From 6f4d4a15f511bd257cf23ca7562c7c7afc1081ac Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 12:22:03 +0100 Subject: [PATCH 111/324] Update devanapp.md --- doc/devanapp.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index eb674cea26..b6592298f6 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,7 +1,14 @@ # Developing an OPS Application - +This page provides a tutorial in the basics of using OPS for multi-block structured mesh application development. This is taken from a [presentation](https://op-dsl.github.io/docs/OPS/tutorial.pdf) given initially in April 2018 and subsequently updated for the latest release of OPS. ## OPS Abstraction +OPS is a Domain Specific Language embedded in C/C++ and Fortran, targeting the development of multi-block structured mesh computations. The abstraction has two distinct components: the definition of the mesh, and operations over the mesh. 
+* Defining a number of 1-3D blocks, and on them a number of datasets, which have specific extents in the different dimensions. +* Describing a parallel loop over a given block, with a given iteration range, executing a given "kernel function" at each grid point, and describing what datasets are going to be accessed and how. +* Additionally, one needs to declare stencils (access patterns) that will be used in parallel loops to access datasets, and any global constants (read-only global scope variables) + +Data and computations expressed this way can be automatically managed and parallelised by the OPS library. Higher dimensions supported in the backend, but not currently by the code generators. + ## Example Application ## Original - Initialisation ## Original - Boundary loops From f1b7764ca4b4c95baa0b6c23c708325e4edbd926 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 12:26:10 +0100 Subject: [PATCH 112/324] Update devanapp.md --- doc/devanapp.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index b6592298f6..7ba242e157 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -7,9 +7,16 @@ OPS is a Domain Specific Language embedded in C/C++ and Fortran, targeting the d * Describing a parallel loop over a given block, with a given iteration range, executing a given "kernel function" at each grid point, and describing what datasets are going to be accessed and how. * Additionally, one needs to declare stencils (access patterns) that will be used in parallel loops to access datasets, and any global constants (read-only global scope variables) -Data and computations expressed this way can be automatically managed and parallelised by the OPS library. Higher dimensions supported in the backend, but not currently by the code generators. +Data and computations expressed this way can be automatically managed and parallelised by the OPS library. 
Higher dimensions are supported in the backend, but not currently by the code generators. ## Example Application +In this tutorial we will use an example application, a simple 2D iterative Laplace equation solver. +* Go to the `OPS/apps/c/laplace2dtutorial/original` directory +* Open the `laplace2d.cpp` file +* It uses an $imax x jmax$ grid, with an additional 1 layers of boundary cells on all sides +* There are a number of loops that set the boundary conditions along the four edges +* The bulk of the simulation is spent in a whilel oop, repeating a stencil kernel with a maximum reduction, and a copy kernel +* Compile and run the code ! ## Original - Initialisation ## Original - Boundary loops ## Original - Main iteration From 10d5691d742984f444c4d1f0018a5f37f36cd208 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 12:32:20 +0100 Subject: [PATCH 113/324] Update devanapp.md --- doc/devanapp.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 7ba242e157..591d4dbccc 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -17,7 +17,30 @@ In this tutorial we will use an example application, a simple 2D iterative Lapla * There are a number of loops that set the boundary conditions along the four edges * The bulk of the simulation is spent in a whilel oop, repeating a stencil kernel with a maximum reduction, and a copy kernel * Compile and run the code ! + ## Original - Initialisation +The original code begins with initializing the data arrays used in the calculation. 
+``` +// Size along y +int jmax = 4094; +// Size along x +int imax = 4094; + +int itermax = 100; +double pi = 2.0∗asin(1.0); +const double tol = 1.0e−6; +double error = 1.0; + +double ∗A; +double ∗Anew; +double ∗y0; + +A = (double ∗)malloc ((imax+2)∗(jmax+2)∗sizeof(double)); +Anew = (double ∗)malloc ((imax+2)∗(jmax+2)∗sizeof(double)); +y0 = (double ∗)malloc ((imax+2)∗sizeof(double)); + +memset(A, 0, (imax+2)∗(jmax+2)∗sizeof(double)); +``` ## Original - Boundary loops ## Original - Main iteration ## Build OPS From b427078c5ad602016a2ad2887b0512294d0c2166 Mon Sep 17 00:00:00 2001 From: Istvan Reguly Date: Tue, 12 Oct 2021 13:45:12 +0200 Subject: [PATCH 114/324] Update SYCL/HIP --- doc/installation.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index ae1f68b90e..97031296b2 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -32,10 +32,12 @@ The [CUDA](https://developer.nvidia.com/cuda-downloads) backend targets NVIDIA G **HIP** -The HIP backend targets AMD GPUs which are supported by the ROCm stack +The HIP backend targets AMD GPUs and NVIDIA GPUs which are supported by HIP - either through its CUDA support or the ROCm stack (tested with >=3.9). **SYCL** +The SYCL backend is currently in development and only working without MPI. It has been tested with Intel OneAPI (>=2021.1), Intel's public LLVM version, and hipSYCL (>=0.9.1), and runs on Intel CPUs and GPUs through Intel's OpenCL and Level Zero, NVIDIA and AMD GPUs both with the LLVM fork as well as hipSYCL. hipSYCL's OpenMP support covers most CPU architectures too. + **Tridiagonal Solver** To use the tridiagonal solver OPS API in applications and build example applications such as `adi`, `adi_burger` and `adi_burger_3D` the open source tridiagonal solver (scalar) library needs to be cloned and built from the [Tridsolver repository](https://github.com/OP-DSL/tridsolver). 
From 8f5b340f2ef82b6bb37d6a7ea3028ebd97fcda78 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 12:57:03 +0100 Subject: [PATCH 115/324] Update devanapp.md --- doc/devanapp.md | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 591d4dbccc..ef674b9df8 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -13,13 +13,13 @@ Data and computations expressed this way can be automatically managed and parall In this tutorial we will use an example application, a simple 2D iterative Laplace equation solver. * Go to the `OPS/apps/c/laplace2dtutorial/original` directory * Open the `laplace2d.cpp` file -* It uses an $imax x jmax$ grid, with an additional 1 layers of boundary cells on all sides +* It uses an $imax$x$jmax$ grid, with an additional 1 layers of boundary cells on all sides * There are a number of loops that set the boundary conditions along the four edges * The bulk of the simulation is spent in a whilel oop, repeating a stencil kernel with a maximum reduction, and a copy kernel * Compile and run the code ! ## Original - Initialisation -The original code begins with initializing the data arrays used in the calculation. +The original code begins with initializing the data arrays used in the calculation: ``` // Size along y int jmax = 4094; @@ -42,7 +42,46 @@ y0 = (double ∗)malloc ((imax+2)∗sizeof(double)); memset(A, 0, (imax+2)∗(jmax+2)∗sizeof(double)); ``` ## Original - Boundary loops +The application sen sets boundary conditions: +``` +for (int i = 0; i < imax+2; i++) + A[(0)*(imax+2)+i] = 0.0; + +for (int i = 0; i < imax+2; i++) + A[(jmax+1)*(imax+2)+i] = 0.0; + +for (int j = 0; j < jmax+2; j++) { + A[(j)*(imax+2)+0] = sin(pi * j / (jmax+1)); +} + +for (int j = 0; j < imax+2; j++) { + A[(j)*(imax+2)+imax+1] = sin(pi * j / (jmax+1))*exp(-pi); +} +``` +Note how in the latter two loops the loop index is used. 
+ ## Original - Main iteration +The main iterative loop is a while loop iterating until the error tolarance is at a set level and the number of iterations are les than the maximum set. +``` +while ( error > tol && iter < iter_max ) { + error = 0.0; + for( int j = 1; j < jmax+1; j++ ) { + for( int i = 1; i < imax+1; i++) { + Anew[(j)*(imax+2)+i] = 0.25f * + ( A[(j)*(imax+2)+i+1] + A[(j)*(imax+2)+i-1] + + A[(j-1)*(imax+2)+i] + A[(j+1)*(imax+2)+i]); + error = fmax( error, fabs(Anew[(j)*(imax+2)+i]-A[(j)*(imax+2)+i])); + } + } + for( int j = 1; j < jmax+1; j++ ) { + for( int i = 1; i < imax+1; i++) { + A[(j)*(imax+2)+i] = Anew[(j)*(imax+2)+i]; + } + } + if(iter % 10 == 0) printf("%5d, %0.6f\n", iter, error); + iter++; + } + ``` ## Build OPS ## Step 1 - Preparing to use OPS ## Step 2 - OPS declarations From 966aba3a68f30956f0635eda820eb1664ce8f05c Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 13:17:34 +0100 Subject: [PATCH 116/324] Update devanapp.md --- doc/devanapp.md | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index ef674b9df8..468c5c915d 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -21,25 +21,26 @@ In this tutorial we will use an example application, a simple 2D iterative Lapla ## Original - Initialisation The original code begins with initializing the data arrays used in the calculation: ``` -// Size along y +//Size along y int jmax = 4094; -// Size along x +//Size along x int imax = 4094; +//Size along x +int iter_max = 100; -int itermax = 100; -double pi = 2.0∗asin(1.0); -const double tol = 1.0e−6; -double error = 1.0; +double pi = 2.0 * asin(1.0); +const double tol = 1.0e-6; +double error = 1.0; -double ∗A; -double ∗Anew; -double ∗y0; +double *A; +double *Anew; +double *y0; -A = (double ∗)malloc ((imax+2)∗(jmax+2)∗sizeof(double)); -Anew = (double ∗)malloc ((imax+2)∗(jmax+2)∗sizeof(double)); -y0 = (double ∗)malloc ((imax+2)∗sizeof(double)); 
+A = (double *)malloc((imax+2) * (jmax+2) * sizeof(double)); +Anew = (double *)malloc((imax+2) * (jmax+2) * sizeof(double)); +y0 = (double *)malloc((imax+2) * sizeof(double)); -memset(A, 0, (imax+2)∗(jmax+2)∗sizeof(double)); +memset(A, 0, (imax+2) * (jmax+2) * sizeof(double)); ``` ## Original - Boundary loops The application sen sets boundary conditions: From b69081c8d987e0cac360e1176651d21336006283 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 13:20:33 +0100 Subject: [PATCH 117/324] Update devanapp.md --- doc/devanapp.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 468c5c915d..0cb450eb45 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -72,18 +72,20 @@ while ( error > tol && iter < iter_max ) { ( A[(j)*(imax+2)+i+1] + A[(j)*(imax+2)+i-1] + A[(j-1)*(imax+2)+i] + A[(j+1)*(imax+2)+i]); error = fmax( error, fabs(Anew[(j)*(imax+2)+i]-A[(j)*(imax+2)+i])); - } } - for( int j = 1; j < jmax+1; j++ ) { - for( int i = 1; i < imax+1; i++) { - A[(j)*(imax+2)+i] = Anew[(j)*(imax+2)+i]; - } + } + for( int j = 1; j < jmax+1; j++ ) { + for( int i = 1; i < imax+1; i++) { + A[(j)*(imax+2)+i] = Anew[(j)*(imax+2)+i]; } - if(iter % 10 == 0) printf("%5d, %0.6f\n", iter, error); - iter++; } - ``` + if(iter % 10 == 0) printf("%5d, %0.6f\n", iter, error); + iter++; +} +``` ## Build OPS +Build OPS using instructions in the [Getting Started](https://ops-dsl.readthedocs.io/en/markdowndocdev/installation.html#getting-started) page. + ## Step 1 - Preparing to use OPS ## Step 2 - OPS declarations ## Step 3 - First parallel loop From c282d969871c6fa5e01274e73676266d1a75c634 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 13:28:57 +0100 Subject: [PATCH 118/324] Update devanapp.md --- doc/devanapp.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 0cb450eb45..9298fd44d7 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -13,7 +13,7 @@ Data and computations expressed this way can be automatically managed and parall In this tutorial we will use an example application, a simple 2D iterative Laplace equation solver. * Go to the `OPS/apps/c/laplace2dtutorial/original` directory * Open the `laplace2d.cpp` file -* It uses an $imax$x$jmax$ grid, with an additional 1 layers of boundary cells on all sides +* It uses an $imax$ x $jmax$ grid, with an additional 1 layers of boundary cells on all sides * There are a number of loops that set the boundary conditions along the four edges * The bulk of the simulation is spent in a whilel oop, repeating a stencil kernel with a maximum reduction, and a copy kernel * Compile and run the code ! @@ -87,6 +87,24 @@ while ( error > tol && iter < iter_max ) { Build OPS using instructions in the [Getting Started](https://ops-dsl.readthedocs.io/en/markdowndocdev/installation.html#getting-started) page. ## Step 1 - Preparing to use OPS +Firstly, include the appropriate header files, then initialise OPS, and at the end finalise it. +* Define that this application is 2D, include the OPS header file, and create a header file where the outlined "elemental kernels" will live. +``` +#define OPS_2D +#include +#include "laplace_kernels.h" +``` +* Initialise and finalise OPS +``` +int main(int argc, const char** argv) { + //Initialise the OPS library, passing runtime args, and setting diagnostics level to low (1) + ops_init(argc, argv,1); + ... + ... 
+ //Finalising the OPS library + ops_exit(); +} +``` ## Step 2 - OPS declarations ## Step 3 - First parallel loop ## Step 4 - Indexes and global constants From 491c94e9a41d691fec46fa61e31b8016fa5ea8d9 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 13:31:15 +0100 Subject: [PATCH 119/324] Update devanapp.md --- doc/devanapp.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 9298fd44d7..31a785f549 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -105,6 +105,8 @@ int main(int argc, const char** argv) { ops_exit(); } ``` +By this point you need OPS set up - take a look at the Makefile in step1, and observ that the include and library paths are added, and we link against `ops_seq`. + ## Step 2 - OPS declarations ## Step 3 - First parallel loop ## Step 4 - Indexes and global constants From e00c0f7eb1a2132f4eec9a39edd819f17b610084 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 14:14:24 +0100 Subject: [PATCH 120/324] Update devanapp.md --- doc/devanapp.md | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 31a785f549..4a35bce518 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -105,9 +105,38 @@ int main(int argc, const char** argv) { ops_exit(); } ``` -By this point you need OPS set up - take a look at the Makefile in step1, and observ that the include and library paths are added, and we link against `ops_seq`. +By this point you need OPS set up - take a look at the Makefile in step1, and observe that the include and library paths are added, and we link against `ops_seq`. 
## Step 2 - OPS declarations +Now declare a block and data on the block : +``` +//The 2D block +ops_block block = ops_decl_block(2, "my_grid"); + +//The two datasets +int size[] = {imax, jmax}; +int base[] = {0,0}; +int d_m[] = {-1,-1}; +int d_p[] = {1,1}; +ops_dat d_A = ops_decl_dat(block, 1, size, base, + d_m, d_p, A, "double", "A"); +ops_dat d_Anew = ops_decl_dat(block, 1, size, base, + d_m, d_p, Anew, "double", "Anew"); +``` +Data sets have a size (number of mesh points in each dimension). There is passing for halos or boundaries in the positive (`d_p`) and negative directions (`d_m`). Here we use a 1 thick boundary layer. Base index can be defined as it may be different from 0 (e.g. in Fortran). Item these with a 0 base index and a 1 wide halo, these datasets can be indexed from −1 tosize +1. + +OPS supports gradual conversion of applications to its API, but in this case the described data sizes will need to match: the allocated memory and its extents need to be correctly described to OPS. In this example we have two `(imax+ 2) ∗ (jmax+ 2)` size arrays, and the total size in each dimension needs to matchsize `[i] + d_p[i] − d_m[i]`. This is only supported for the sequential and OpenMP backends. If a `NULL` pointer is passed, OPS will allocate the data internally. + +We also need to declare the stencils that will be used - in this example most loops use a simple 1-point stencil, and one uses a 5-point stencil: +``` +//Two stencils, a 1-point, and a 5-point +int s2d_00[] = {0,0}; +ops_stencil S2D_00 = ops_decl_stencil(2,1,s2d_00,"0,0"); +int s2d_5pt[] = {0,0, 1,0, -1,0, 0,1, 0,-1}; +ops_stencil S2D_5pt = ops_decl_stencil(2,5,s2d_5pt,"5pt"); +``` +Different names may be used for stencils in your code, but we suggest using some convention. + ## Step 3 - First parallel loop ## Step 4 - Indexes and global constants ## Step 5 - Complex stencils and reductions From fd0133b8262eb6ac50ad811916a1d7d7c79e39ca Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 14:25:17 +0100 Subject: [PATCH 121/324] Update devanapp.md --- doc/devanapp.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 4a35bce518..d62385c434 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -138,6 +138,27 @@ ops_stencil S2D_5pt = ops_decl_stencil(2,5,s2d_5pt,"5pt"); Different names may be used for stencils in your code, but we suggest using some convention. ## Step 3 - First parallel loop +You can now convert the first loop to use OPS: +``` +for (int i = 0; i < imax+2; i++) + A[(0)*(imax+2)+i] = 0.0; +``` +This is a loop on the ottom boundary of the domain, which is at the −1 index for our dataset, therefore our iteration range will be over the entire domain, including halos in the X direction, and the bottom boundary in the Y direction. The iteration range is given as beginning (inclusive) and end (exclusive) indices in the x, y, etc. directions. +``` +int bottom_range[] = {-1, imax+1, -1, 0}; +``` +Next, we need to outline the “elemental” into `laplacekernels.h`, and place the appropriate access objects - `ACC &A`, in the kernel’s formal parameter list, and `(i,j)` are the stencil offsets in the X and Y directions respectively: +``` +void set_zero(ACC &A) { + A(0,0) = 0.0; +} +``` +The OPS parallel loop can now be written as follows: +``` +ops_par_loop(set_zero, "set_zero", block, 2, bottom_range, + ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE)); +``` +The loop will execute `set_zero` at each mesh point defined in the iteration range, and write the dataset `d_A` with the 1-point stencil. The `ops_par_loop` implies that the order in which grid points will be executed will not affect the end result (within machine precision). ## Step 4 - Indexes and global constants ## Step 5 - Complex stencils and reductions ## Step 6 - Handing it all to OPS From d950549bafe6f78272d9a5a2d2926ec4d8e51af8 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 14:26:03 +0100 Subject: [PATCH 122/324] Update devanapp.md --- doc/devanapp.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index d62385c434..f0c5d1fa44 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -4,7 +4,7 @@ This page provides a tutorial in the basics of using OPS for multi-block structu ## OPS Abstraction OPS is a Domain Specific Language embedded in C/C++ and Fortran, targeting the development of multi-block structured mesh computations. The abstraction has two distinct components: the definition of the mesh, and operations over the mesh. * Defining a number of 1-3D blocks, and on them a number of datasets, which have specific extents in the different dimensions. -* Describing a parallel loop over a given block, with a given iteration range, executing a given "kernel function" at each grid point, and describing what datasets are going to be accessed and how. +* Describing a parallel loop over a given block, with a given iteration range, executing a given "kernel function" at each mesh point, and describing what datasets are going to be accessed and how. * Additionally, one needs to declare stencils (access patterns) that will be used in parallel loops to access datasets, and any global constants (read-only global scope variables) Data and computations expressed this way can be automatically managed and parallelised by the OPS library. Higher dimensions are supported in the backend, but not currently by the code generators. @@ -13,7 +13,7 @@ Data and computations expressed this way can be automatically managed and parall In this tutorial we will use an example application, a simple 2D iterative Laplace equation solver. 
* Go to the `OPS/apps/c/laplace2dtutorial/original` directory * Open the `laplace2d.cpp` file -* It uses an $imax$ x $jmax$ grid, with an additional 1 layers of boundary cells on all sides +* It uses an $imax$ x $jmax$ mesh, with an additional 1 layers of boundary cells on all sides * There are a number of loops that set the boundary conditions along the four edges * The bulk of the simulation is spent in a whilel oop, repeating a stencil kernel with a maximum reduction, and a copy kernel * Compile and run the code ! @@ -158,7 +158,7 @@ The OPS parallel loop can now be written as follows: ops_par_loop(set_zero, "set_zero", block, 2, bottom_range, ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE)); ``` -The loop will execute `set_zero` at each mesh point defined in the iteration range, and write the dataset `d_A` with the 1-point stencil. The `ops_par_loop` implies that the order in which grid points will be executed will not affect the end result (within machine precision). +The loop will execute `set_zero` at each mesh point defined in the iteration range, and write the dataset `d_A` with the 1-point stencil. The `ops_par_loop` implies that the order in which mesh points will be executed will not affect the end result (within machine precision). ## Step 4 - Indexes and global constants ## Step 5 - Complex stencils and reductions ## Step 6 - Handing it all to OPS From 04396ea5227681855320cd00698dca7c4fa52025 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 14:30:14 +0100 Subject: [PATCH 123/324] Update devanapp.md --- doc/devanapp.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index f0c5d1fa44..6eb4d65411 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -159,6 +159,20 @@ ops_par_loop(set_zero, "set_zero", block, 2, bottom_range, ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE)); ``` The loop will execute `set_zero` at each mesh point defined in the iteration range, and write the dataset `d_A` with the 1-point stencil. The `ops_par_loop` implies that the order in which mesh points will be executed will not affect the end result (within machine precision). + +There are three more loops which set values to zero, they can be trivially replaced with the code above, only altering the iteration range. In the main while loop, the second simpler loop simply copies data from one array to another, this time on the interior of the domain: +``` +int interior_range[] = {0,imax,0,jmax}; +ops_par_loop(copy, "copy", block, 2, interior_range, + ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE), + ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_READ)); +``` +And the corresponding outlined elemental kernel is as follows: +``` +void copy(ACC &A, const ACC &Anew) { + A(0,0) = Anew(0,0); +} +``` ## Step 4 - Indexes and global constants ## Step 5 - Complex stencils and reductions ## Step 6 - Handing it all to OPS From 1851283583fc9a8043a1e3ebfe076d9dd260e5b7 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 15:57:12 +0100 Subject: [PATCH 124/324] Update devanapp.md --- doc/devanapp.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 6eb4d65411..45342263e3 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -174,6 +174,31 @@ void copy(ACC &A, const ACC &Anew) { } ``` ## Step 4 - Indexes and global constants +There are two sets of boundary loops which use the loop variable j - this is a common technique to initialise data, such as coordinates `(x = i∗dx)`. OPS has a special argument `ops_arg_idx` which gives us a globally coherent (including over MPI) iteration index - between the bounds supplied in the iteration range. +``` +ops_par_loop(left_bndcon, "left_bndcon", block, 2, left_range, + ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_WRITE), + ops_arg_idx()); +``` +And the corresponding outlined user kernel is as follows. Observe the `idx` argument and the +1 offset due to the difference in indexing: +``` +void left_bndcon(ACC &A, const int *idx) { + A(0,0) = sin(pi * (idx[1]+1) / (jmax+1)); +} +``` +This kernel also uses two variables,`jmax` and `pi` that do not depend on the iteration index - they are iteration space invariant. OPS has two ways of supporting this: +1. Global scope constants, through `ops_decl_const`, as done in this example: we need to move the declaration of the `imax`,`jmax` and `pi` variables to global scope (outside of main), and call the OPS API: +``` +//declare and define global constants +ops_decl_const("imax",1,"int",&imax); +ops_decl_const("jmax",1,"int",&jmax); +ops_decl_const("pi",1,"double",&pi); +``` +These ariables do not need to be passed in to the elemental kernel, they are accessible in all elemental kernels. + +2. The other option is to explicitly pass it to the elemental kernel with `ops_arg_gbl`: this is for scalars and small arrays that should not be in global scope. 
+ + ## Step 5 - Complex stencils and reductions ## Step 6 - Handing it all to OPS ## Step 7 - Code generation From ac9ed0c056c9af983b88091a3b8aa5d0a6c2da21 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 16:17:51 +0100 Subject: [PATCH 125/324] Update devanapp.md --- doc/devanapp.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 45342263e3..01adc41505 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -187,6 +187,7 @@ void left_bndcon(ACC &A, const int *idx) { } ``` This kernel also uses two variables,`jmax` and `pi` that do not depend on the iteration index - they are iteration space invariant. OPS has two ways of supporting this: + 1. Global scope constants, through `ops_decl_const`, as done in this example: we need to move the declaration of the `imax`,`jmax` and `pi` variables to global scope (outside of main), and call the OPS API: ``` //declare and define global constants ops_decl_const("imax",1,"int",&imax); ops_decl_const("jmax",1,"int",&jmax); ops_decl_const("pi",1,"double",&pi); ``` These variables do not need to be passed in to the elemental kernel, they are accessible in all elemental kernels. @@ -200,6 +201,21 @@ These variables do not need to be passed in to the elemental kernel, they are acc ## Step 5 - Complex stencils and reductions +There is only one loop left, which uses a 5 point stencil and a reduction. It can be outlined as usual, and for the stencil, we will use `S2D_5pt`. +``` +ops_par_loop(apply_stencil, "apply_stencil", block, 2, interior_range, + ops_arg_dat(d_A, 1, S2D_5pt, "double", OPS_READ), + ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_WRITE), + ops_arg_reduce(h_err, 1, "double", OPS_MAX)) +``` +And the corresponding outlined elemental kernel is as follows. 
Observe the stencil offsets used to access the adjacent 4 points: +``` +void apply_stencil(const ACC &A, ACC &Anew, double *error) { + Anew(0,0) = 0.25f * ( A(1,0) + A(-1,0) + + A(0,-1) + A(0,1)); + *error = fmax( *error, fabs(Anew(0,0)-A(0,0))); +} +``` ## Step 6 - Handing it all to OPS ## Step 7 - Code generation ## Code generated versions From 1a95cf9fba5c81a539d6e46e12928d1acc477563 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 16:53:24 +0100 Subject: [PATCH 126/324] Update devanapp.md --- doc/devanapp.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 01adc41505..8ca7c83cfe 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -188,7 +188,7 @@ void left_bndcon(ACC &A, const int *idx) { ``` This kernel also uses two variables,`jmax` and `pi` that do not depend on the iteration index - they are iteration space invariant. OPS has two ways of supporting this: -1. Global scope constants, through `ops_decl_const`, as done in this example: we need to move the declaration of the `imax`,`jmax` and `pi` variables to global scope (outside of main), and call the OPS API: +1) Global scope constants, through `ops_decl_const`, as done in this example: we need to move the declaration of the `imax`,`jmax` and `pi` variables to global scope (outside of main), and call the OPS API: ``` //declare and define global constants ops_decl_const("imax",1,"int",&imax); @@ -197,7 +197,7 @@ ops_decl_const("pi",1,"double",&pi); ``` These ariables do not need to be passed in to the elemental kernel, they are accessible in all elemental kernels. -2. The other option is to explicitly pass it to the elemental kernel with `ops_arg_gbl`: this is for scalars and small arrays that should not be in global scope. +2) The other option is to explicitly pass it to the elemental kernel with `ops_arg_gbl`: this is for scalars and small arrays that should not be in global scope. 
## Step 5 - Complex stencils and reductions From acfb88ba9ab0d798a34d14466407f5193aa00596 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 17:06:38 +0100 Subject: [PATCH 127/324] Update devanapp.md --- doc/devanapp.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 8ca7c83cfe..2465204f5d 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -216,6 +216,13 @@ void apply_stencil(const ACC &A, ACC &Anew, double *error) { *error = fmax( *error, fabs(Anew(0,0)-A(0,0))); } ``` +The loop also has a special argument for the reduction, `ops_arg_reduce`. As the first argument, it takes a reduction handle, which has to be defined separately: +``` +//Reduction handle +ops_reduction h_err = ops_decl_reduction_handle(sizeof(double), "double", "error"); +``` +Reductions may be increment (`OPS_INC`), min (`OPS_MIN`) or max(`OPS_MAX`). The user kernel will have to perform the reduction operation, reducing the passed in value as well as the computed value. + ## Step 6 - Handing it all to OPS ## Step 7 - Code generation ## Code generated versions From 7fed24468d1672f2923734d4b779132dfe37fd5b Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 17:45:44 +0100 Subject: [PATCH 128/324] Update devanapp.md --- doc/devanapp.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 2465204f5d..f0bb88a59a 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -221,7 +221,15 @@ The loop also has a special argument for the reduction, `ops_arg_reduce`. As th //Reduction handle ops_reduction h_err = ops_decl_reduction_handle(sizeof(double), "double", "error"); ``` -Reductions may be increment (`OPS_INC`), min (`OPS_MIN`) or max(`OPS_MAX`). The user kernel will have to perform the reduction operation, reducing the passed in value as well as the computed value. +Reductions may be increment (`OPS_INC`), min (`OPS_MIN`) or max (`OPS_MAX`). 
The user kernel will have to perform the reduction operation, reducing the passed in value as well as the computed value. + +The result of the reduction can be queried from the handle as follows: +``` + ops_reduction_result(h_err, &error); +``` + +Multiple parallel loops may use the same handle, and their results will be combined, until the result is queried by the user. Parallel loops that only have the reduction handle in common are semantically independent. + ## Step 6 - Handing it all to OPS ## Step 7 - Code generation From b616d3a419f370facf3389284517fa35b747cf44 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 17:47:30 +0100 Subject: [PATCH 129/324] Update devanapp.md --- doc/devanapp.md | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index f0bb88a59a..4e1bd73308 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -218,7 +218,6 @@ void apply_stencil(const ACC &A, ACC &Anew, double *error) { ``` The loop also has a special argument for the reduction, `ops_arg_reduce`. As the first argument, it takes a reduction handle, which has to be defined separately: ``` -//Reduction handle ops_reduction h_err = ops_decl_reduction_handle(sizeof(double), "double", "error"); ``` Reductions may be increment (`OPS_INC`), min (`OPS_MIN`) or max (`OPS_MAX`). The user kernel will have to perform the reduction operation, reducing the passed in value as well as the computed value. From 16ff1aa5169feda9e1471da3d74d0c3ec69302c3 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Mon, 18 Oct 2021 14:03:50 +0100 Subject: [PATCH 130/324] Update devanapp.md --- doc/devanapp.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 4e1bd73308..dbbbcf063f 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -229,10 +229,34 @@ The result of the reduction can be queried from the handle as follows: Multiple parallel loops may use the same handle, and their results will be combined, until the result is queried by the user. Parallel loops that only have the reduction handle in common are semantically independent. - ## Step 6 - Handing it all to OPS + +We have now successfully converted all computations on the mesh to OPS parallel loops. In order for OPS to manage data and parallelisations better, we should let OPS allocate the datasets - instead of passing in the pointers to memory allocated by us, we just pass in NULL (`A` and `Anew`). Parallel I/O can be done using HDF5 - see the ops_hdf5.h header. + +All data and parallelisation is now handed to OPS. We can now also compile the developer MPI version of the code - see the Makefile, and try building `laplace2d_mpi`. + ## Step 7 - Code generation + +Now that the developer versions of our code work, it’s time to generate code. On the console, type: +``` +$OPSINSTALLPATH/../ops_translator/c/ops.py laplace2d.cpp +``` +We have provided a Makefile which can use several different compilers (intel, cray, pgi, clang), we suggest modifying it for your own applications. Try building CUDA, OpenMP, MPI+CUDA, MPI+OpenMP, and other versions of the code. You can take a look at the generated kernels for different parallelisations under the appropriate subfolders. + +If you add the−`OPS_DIAGS=2` runtime flag, at the end of execution, OPS will report timings and achieved bandwidth for each of your kernels. For more options, see the user guide. 
+ + ## Code generated versions +OPS will generate and compile a large number of different versions. +* `laplace2d_dev_seq` and `laplace2d_dev_mpi` : these do not use code generation, they are intended for development only +* `laplace2d_seq` and `laplace2d_mpi` : baseline sequential and MPI implementations +* `laplace2d_openmp` : baseline OpenMP implementation +* `laplace2d_cuda`, `laplace2d_opencl`, `laplace2d_openacc` : implementations targeting GPUs +* `laplace2d_mpiinline` : optimised implementation with MPI+OpenMP +* `laplace2d_tiled`: optimised implementation with OpenMP that improves spatial and temporal locality + + + ## Optimizations - general ## Optimizations - tiling From d00b503642a1c4519ab552e5bebdba82798c55b2 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 18 Oct 2021 14:17:15 +0100 Subject: [PATCH 131/324] Update devanapp.md --- doc/devanapp.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index dbbbcf063f..6179f2693c 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -243,7 +243,7 @@ $OPSINSTALLPATH/../ops_translator/c/ops.py laplace2d.cpp ``` We have provided a Makefile which can use several different compilers (intel, cray, pgi, clang), we suggest modifying it for your own applications. Try building CUDA, OpenMP, MPI+CUDA, MPI+OpenMP, and other versions of the code. You can take a look at the generated kernels for different parallelisations under the appropriate subfolders. -If you add the−`OPS_DIAGS=2` runtime flag, at the end of execution, OPS will report timings and achieved bandwidth for each of your kernels. For more options, see the user guide. +If you add the−`OPS_DIAGS=2` runtime flag, at the end of execution, OPS will report timings and achieved bandwidth for each of your kernels. For more options, see [Runtime Flags and Options](https://ops-dsl.readthedocs.io/en/markdowndocdev/devanapp.html#runtime-flags-and-options). 
## Code generated versions @@ -255,11 +255,24 @@ OPS will generate and compile a large number of different versions. * `laplace2d_mpiinline` : optimised implementation with MPI+OpenMP * `laplace2d_tiled`: optimised implementation with OpenMP that improves spatial and temporal locality +## Optimizations - general +Try the following performance tuning options +* `laplace2d_cuda`, `laplace2d_opencl` : you can set the `OPS_BLOCK_SIZE_X` and `OPS_BLOCK_SIZE_Y` runtime arguments to control thread block or work group sizes +* `laplace2d_mpi_cuda`, `laplace2d_mpi_openacc` : add the `-gpudirect` runtime flag to enable GPU Direct communications -## Optimizations - general ## Optimizations - tiling +Tiling uses lazy execution: as parallel loops follow one another, they are not executed, but put in a queue, and only once some data needs to be returned to the user (e.g. result of a reduction) do these loops have to be executed. + +With a chain of loops queued, OPS can analyse them together and come up with a tiled execution schedule. + +This works over MPI as well: OPS extends the halo regions, and does one big halo exchange instead of several smaller ones. In the current `laplace2d` code, every stencil application loop is also doing a reduction, therefore only two loops are queued. Try modifying the code so the reduction only happens every 10 iterations ! On A Xeon E5-2650, one can get a 2.5x speedup. + +The following versions can be executed with the tiling optimzations. + +* `laplace2d_tiled`, `laplace2d_mpi_tiled` : add the `OPS_TILING` runtime flag, and move `-OPSDIAGS=3` to see the cache blocking tiling at work. For some applications, such as this one, the initial guess gives too large tiles, try setting `OPS_CACHE_SIZE` to a lower value (in MB, for L3 size). Thread affinity control and using 1 process per socket isstrongly recommended. E.g. `OMP_NUM_THREADS=20 numactl--cpunodebind=0 ./laplace2dtiled -OPSDIAGS=3 OPS_TILING OPS_CACHE_SIZE=5`. 
Over MPI, you will have to set `OPS_TILING_MAX_DEPTH` to extend halo regions. + ## Supported Paralleizations ## Code-generation Flags ## Runtime Flags and Options From 09d09e157e28524c87b42be356eeb075b96c865e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 18 Oct 2021 14:22:21 +0100 Subject: [PATCH 132/324] Update devanapp.md --- doc/devanapp.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 6179f2693c..dbb72e4401 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -267,12 +267,14 @@ Tiling uses lazy execution: as parallel loops follow one another, they are not e With a chain of loops queued, OPS can analyse them together and come up with a tiled execution schedule. -This works over MPI as well: OPS extends the halo regions, and does one big halo exchange instead of several smaller ones. In the current `laplace2d` code, every stencil application loop is also doing a reduction, therefore only two loops are queued. Try modifying the code so the reduction only happens every 10 iterations ! On A Xeon E5-2650, one can get a 2.5x speedup. +This works over MPI as well: OPS extends the halo regions, and does one big halo exchange instead of several smaller ones. In the current `laplace2d` code, every stencil application loop is also doing a reduction, therefore only two loops are queued. Try modifying the code so the reduction only happens every 10 iterations ! On a Xeon E5-2650, one can get a 2.5x speedup. The following versions can be executed with the tiling optimzations. * `laplace2d_tiled`, `laplace2d_mpi_tiled` : add the `OPS_TILING` runtime flag, and move `-OPSDIAGS=3` to see the cache blocking tiling at work. For some applications, such as this one, the initial guess gives too large tiles, try setting `OPS_CACHE_SIZE` to a lower value (in MB, for L3 size). Thread affinity control and using 1 process per socket isstrongly recommended. E.g. 
`OMP_NUM_THREADS=20 numactl--cpunodebind=0 ./laplace2dtiled -OPSDIAGS=3 OPS_TILING OPS_CACHE_SIZE=5`. Over MPI, you will have to set `OPS_TILING_MAX_DEPTH` to extend halo regions. ## Supported Paralleizations -## Code-generation Flags + + ## Runtime Flags and Options From 87eefdb5989be7df4beff1d7edb759bc1ce7d81d Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 18 Oct 2021 16:07:15 +0100 Subject: [PATCH 134/324] Update introduction.md --- doc/introduction.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index ac9e9b2da0..1e25b5f3e1 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -2,7 +2,10 @@ ## Overview -[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. +[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language (eDSL) for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. 
These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. + +The current OPS eDSL supports generating code targeting multi-core/multi-threaded CPUs, many-core GPUs and clusters of CPUs and GPUs using a range of parallelization models including SIMD vectorization, OpenMP, CUDA, OpenCL, OpenACC and their combinations with MPI. There is also experimental support for parallelizations using SYCL and AMD HIP. Various optimizations for each parallelization can be generated automatically, including cache blocking tiling to improve locality. The OPS API and library can also be used to solve scalar multi-dimensional tridiagonal systems using the [tridsolver](https://github.com/OP-DSL/tridsolver) library. + ## Licencing OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From 98f3f911f340a592e0f62a72f9262567f53f213e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 18 Oct 2021 16:32:41 +0100 Subject: [PATCH 135/324] Update devanapp.md --- doc/devanapp.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index a9cd82bde4..209bd295ed 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -273,8 +273,25 @@ The following versions can be executed with the tiling optimzations. * `laplace2d_tiled`, `laplace2d_mpi_tiled` : add the `OPS_TILING` runtime flag, and move `-OPSDIAGS=3` to see the cache blocking tiling at work. For some applications, such as this one, the initial guess gives too large tiles, try setting `OPS_CACHE_SIZE` to a lower value (in MB, for L3 size). Thread affinity control and using 1 process per socket isstrongly recommended. E.g. `OMP_NUM_THREADS=20 numactl--cpunodebind=0 ./laplace2dtiled -OPSDIAGS=3 OPS_TILING OPS_CACHE_SIZE=5`. 
Over MPI, you will have to set `OPS_TILING_MAX_DEPTH` to extend halo regions. -## Supported Paralleizations + ## Runtime Flags and Options + +### General flags +* `OPS_DIAGS=` +* `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` +* `-gpudirect` +* `OPS_CL_DEVICE=` +* `OPS_TILING` +* `OPS_TILING_MAXDEPTH=` + +### Tridsolver API flags +* -halo 1 +* -m +* `-bx`, `-by` and `-bz` + + + + From c1d7a940a05c131ca99019ade595c21a4a9ba5c5 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 18 Oct 2021 16:48:40 +0100 Subject: [PATCH 136/324] Update devdoc.md --- doc/devdoc.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index e959787cdd..85425fa416 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -9,6 +9,16 @@ ### HDF5 ### CUDA ### Cache blocking tiling and comm-avoiding optimizations + +## To contribute to OPS please use the following steps : + +Clone the [OPS](https://github.com/OP-DSL/OPS) repository (on your local system). +Create a new branch in your cloned repository +Make changes / contributions in your new branch +Submit your changes by creating a Pull Request to the `develop` branch of the OPS repository +The contributions in the `develop` branch will be merged into the `master` branch as we create a new release. + + From 5399d39dc39438428b0427aee5176d0ad5d7e6a8 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 19 Oct 2021 16:51:11 +0100 Subject: [PATCH 137/324] Update devanapp.md --- doc/devanapp.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 209bd295ed..03a556144b 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -288,8 +288,8 @@ The following versions can be executed with the tiling optimzations. * `OPS_TILING_MAXDEPTH=` ### Tridsolver API flags -* -halo 1 -* -m +* `-halo 1` +* `-m` * `-bx`, `-by` and `-bz` From 9d39a659a7c973d212381f6fcf90f525a1d73e1c Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Wed, 20 Oct 2021 16:38:55 +0100 Subject: [PATCH 138/324] Update pubs.md --- doc/pubs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/pubs.md b/doc/pubs.md index 9aca12e2f5..8c6f967998 100644 --- a/doc/pubs.md +++ b/doc/pubs.md @@ -1,2 +1,3 @@ # Publications +See [OP-DSL publications page](https://op-dsl.github.io/papers.html). From fa07e3354a05e6b731c284d516108014af47623e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 16:56:41 +0100 Subject: [PATCH 139/324] Update devdoc.md --- doc/devdoc.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index 85425fa416..fdc401efe3 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -1,4 +1,6 @@ # Developer Guide +Under construction. + ## To contribute to OPS please use the following steps : Clone the [OPS](https://github.com/OP-DSL/OPS) repository (on your local system). From 0d4ec0d3684f7fb2c437c62e5289564922f59342 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 16:59:16 +0100 Subject: [PATCH 140/324] Update devdoc.md --- doc/devdoc.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index fdc401efe3..3d8efb5abc 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -14,10 +14,11 @@ Under construction. --> ## To contribute to OPS please use the following steps : -Clone the [OPS](https://github.com/OP-DSL/OPS) repository (on your local system). -Create a new branch in your cloned repository -Make changes / contributions in your new branch -Submit your changes by creating a Pull Request to the `develop` branch of the OPS repository +1. Clone the [OPS](https://github.com/OP-DSL/OPS) repository (on your local system). +2. Create a new branch in your cloned repository +3. Make changes / contributions in your new branch +4. 
Submit your changes by creating a Pull Request to the `develop` branch of the OPS repository + The contributions in the `develop` branch will be merged into the `master` branch as we create a new release. -## To contribute to OPS please use the following steps : +## Contributing +To contribute to OPS please use the following steps : 1. Clone the [OPS](https://github.com/OP-DSL/OPS) repository (on your local system). 2. Create a new branch in your cloned repository 3. Make changes / contributions in your new branch From 09d3b8d991d70e0e2f9f152f6e48832930bd0113 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 17:07:09 +0100 Subject: [PATCH 142/324] Update perf.md --- doc/perf.md | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index be069ba51b..b723960e8b 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -45,7 +45,6 @@ arguments. For example, ```bash ./cloverleaf_cuda OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 ``` -## CUDA-aware MPI ## OpenCL arguments `OPS_CL_DEVICE=XX` runtime flag sets the OpenCL device to execute the From 97733c819c8dda241736f4343dc537a68b0d76a1 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 17:25:58 +0100 Subject: [PATCH 143/324] Create numawrap --- scripts/numawrap | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 scripts/numawrap diff --git a/scripts/numawrap b/scripts/numawrap new file mode 100644 index 0000000000..e36467b975 --- /dev/null +++ b/scripts/numawrap @@ -0,0 +1,37 @@ +#!/bin/bash +# e.g. mpirun -np 4 numawrap ./application + +# Find the rank of the process from the MPI local rank environment variable +# to ensure unique output filenames. 
+if [[ -n ${OMPI_COMM_WORLD_LOCAL_RANK} ]] + then + let lrank=${OMPI_COMM_WORLD_LOCAL_RANK} +elif [[ -n ${MV2_COMM_WORLD_LOCAL_RANK} ]] + then + let lrank=${MV2_COMM_WORLD_LOCAL_RANK} +elif [[ -n ${PMI_RANK} ]] + then + let lrank=${PMI_RANK} +elif [[ -n ${PMI_ID} && -n ${MPISPAWN_LOCAL_NPROCS} ]] + then + let lrank=${PMI_ID}%${PERHOST} +elif [[ -n ${MPI_LOCALRANKID} ]] + then + let lrank=${MPI_LOCALRANKID} +else + echo could not determine local rank +fi + +export CUDA_VISIBLE_DEVICES=${lrank} + +# let lrank=${PMI_RANK} +echo $lrank + +# use $lrank -lt 2 and $lrank -ge 2 to distribute and bind 4 procs on to 2 CPUs +if [[ $lrank -lt 2 ]]; then + numactl --cpunodebind=0 ${@} +fi + +if [[ $lrank -ge 2 ]]; then + numactl --cpunodebind=1 ${@} +fi From ac8d7ca792f5a946dfcbcf11ce25c4a0e8721cfb Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 17:28:12 +0100 Subject: [PATCH 144/324] Update perf.md --- doc/perf.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/perf.md b/doc/perf.md index b723960e8b..f90d15b29e 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -1,6 +1,6 @@ # Performance Tuning -## Vectorization + ## Executing with GPUDirect @@ -22,7 +22,7 @@ When MPI is combined with OpenMP tiling can be extended to the MPI halos. 
Set `OPS_TILING_MAXDEPTH` to increase the the halo depths so that halos for multiple `ops_par_loops` can be exchanged with a single MPI message (see [@TPDS2017] for more details)\ -To test, compile CloverLeaf under ``apps/c/CloverLeaf``, modify clover.in +To test, compile CloverLeaf under ``OPS/apps/c/CloverLeaf``, modify clover.in to use a $6144^2$ mesh, then run as follows:\ For OpenMP with tiling: ```bash @@ -37,7 +37,9 @@ OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments: ```bash export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING OPS_TILESIZE_X=600 OPS_TILESIZE_Y=200 ``` -## OpenMP with MPI +## OpenMP and OpenMP+MPI +It is recommended that you assign one MPI rank per NUMA region when executing MPI+OpenMP parallel code. Usually for a multi-CPU system a single CPU socket is a single NUMA region. Thus, for a 4 socket system, OPS's MPI+OpenMP code should be executed with 4 MPI processes with each MPI process having multiple OpenMP threads (typically specified by the `OMP_NUM_THREAD`s flag). Additionally on some systems using `numactl` to bind threads to cores could give performance improvements (see `OPS/scripts/numawrap` for an example script that wraps the `numactl` command to be used with common MPI distributions). + ## CUDA arguments The CUDA (and OpenCL) thread block sizes can be controlled by setting the ``OPS_BLOCK_SIZE_X``, ``OPS_BLOCK_SIZE_Y`` and ``OPS_BLOCK_SIZE_Z`` runtime @@ -45,8 +47,8 @@ arguments. For example, ```bash ./cloverleaf_cuda OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 ``` -## OpenCL arguments +## OpenCL arguments `OPS_CL_DEVICE=XX` runtime flag sets the OpenCL device to execute the code on. From 81b78a71c473926738c602c96526bdeb4ffe28cc Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Wed, 20 Oct 2021 17:35:47 +0100 Subject: [PATCH 145/324] Update devanapp.md --- doc/devanapp.md | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 03a556144b..40607e76a4 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -273,24 +273,7 @@ The following versions can be executed with the tiling optimzations. * `laplace2d_tiled`, `laplace2d_mpi_tiled` : add the `OPS_TILING` runtime flag, and move `-OPSDIAGS=3` to see the cache blocking tiling at work. For some applications, such as this one, the initial guess gives too large tiles, try setting `OPS_CACHE_SIZE` to a lower value (in MB, for L3 size). Thread affinity control and using 1 process per socket isstrongly recommended. E.g. `OMP_NUM_THREADS=20 numactl--cpunodebind=0 ./laplace2dtiled -OPSDIAGS=3 OPS_TILING OPS_CACHE_SIZE=5`. Over MPI, you will have to set `OPS_TILING_MAX_DEPTH` to extend halo regions. - - - -## Runtime Flags and Options - -### General flags -* `OPS_DIAGS=` -* `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` -* `-gpudirect` -* `OPS_CL_DEVICE=` -* `OPS_TILING` -* `OPS_TILING_MAXDEPTH=` - -### Tridsolver API flags -* `-halo 1` -* `-m` -* `-bx`, `-by` and `-bz` From 77c0b0e983e3e2166156ecdfb3cc932d1eb0a777 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 17:38:52 +0100 Subject: [PATCH 146/324] Update opsapi.md --- doc/opsapi.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/doc/opsapi.md b/doc/opsapi.md index b8c9a908e7..d3b78dd7ac 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -683,5 +683,22 @@ This routine copies the data given by the user to the internal data structure u |part| the chunk index (has to be 0)| |data| pointer to memory which should be copied to OPS | +## Runtime Flags and Options + +The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. 
+### General flags +* `OPS_DIAGS=` +* `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` +* `-gpudirect` +* `OPS_CL_DEVICE=` +* `OPS_TILING` +* `OPS_TILING_MAXDEPTH=` + +### Tridsolver API flags +* `-halo 1` +* `-m` +* `-bx`, `-by` and `-bz` + + ## Doxygen Doxygen generated from OPS source can be found [here](https://op-dsl-ci.gitlab.io/ops-ci/). From 19c4192032893d8662c41365db1e419055daa2d4 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 17:44:38 +0100 Subject: [PATCH 147/324] Update apps.md --- doc/apps.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/apps.md b/doc/apps.md index 5bee74c5de..0b8fca0efb 100644 --- a/doc/apps.md +++ b/doc/apps.md @@ -1,5 +1,11 @@ # Examples -## CloverLeaf 2D + +See `OPS/apps/[c|fortran]/[application]/test.sh` on compiling and running various parallel versions generated by OPS for each application. + +Further documentation under construction. + + From 7366cacfc7f8b0b6494ffbdef07f9ecf048cc49a Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 22:07:22 +0100 Subject: [PATCH 148/324] Update perf.md --- doc/perf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index f90d15b29e..822160a597 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -21,7 +21,7 @@ in Megabytes. To manually specify the tile sizes, use the When MPI is combined with OpenMP tiling can be extended to the MPI halos. Set `OPS_TILING_MAXDEPTH` to increase the the halo depths so that halos for multiple `ops_par_loops` can be exchanged with a single MPI -message (see [@TPDS2017] for more details)\ +message (see [TPDS2017](https://ieeexplore.ieee.org/abstract/document/8121995) for more details)\ To test, compile CloverLeaf under ``OPS/apps/c/CloverLeaf``, modify clover.in to use a $6144^2$ mesh, then run as follows:\ For OpenMP with tiling: From 0c6948f1389d886062ab625bd8587b497ec76b51 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Wed, 20 Oct 2021 22:09:28 +0100 Subject: [PATCH 149/324] Update perf.md --- doc/perf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index 822160a597..1cec2e145f 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -38,7 +38,7 @@ OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments: export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING OPS_TILESIZE_X=600 OPS_TILESIZE_Y=200 ``` ## OpenMP and OpenMP+MPI -It is recommended that you assign one MPI rank per NUMA region when executing MPI+OpenMP parallel code. Usually for a multi-CPU system a single CPU socket is a single NUMA region. Thus, for a 4 socket system, OPS's MPI+OpenMP code should be executed with 4 MPI processes with each MPI process having multiple OpenMP threads (typically specified by the `OMP_NUM_THREAD`s flag). Additionally on some systems using `numactl` to bind threads to cores could give performance improvements (see `OPS/scripts/numawrap` for an example script that wraps the `numactl` command to be used with common MPI distributions). +It is recommended that you assign one MPI rank per NUMA region when executing MPI+OpenMP parallel code. Usually for a multi-CPU system a single CPU socket is a single NUMA region. Thus, for a 4 socket system, OPS's MPI+OpenMP code should be executed with 4 MPI processes with each MPI process having multiple OpenMP threads (typically specified by the `OMP_NUM_THREAD` flag). Additionally on some systems using `numactl` to bind threads to cores could give performance improvements (see `OPS/scripts/numawrap` for an example script that wraps the `numactl` command to be used with common MPI distributions). 
## CUDA arguments The CUDA (and OpenCL) thread block sizes can be controlled by setting From 263630f81ab8e39f574cc419cbd6715e15be36d0 Mon Sep 17 00:00:00 2001 From: Istvan Reguly Date: Thu, 21 Oct 2021 16:57:45 +0200 Subject: [PATCH 150/324] C API clarification --- doc/opsapi.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index d3b78dd7ac..3562a11453 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -27,7 +27,7 @@ also under MPI). Reductions in loops are done using the `ops_arg_reduce` argument, which takes a reduction handle as an argument. The result of the reduction can then be acquired using a separate call to `ops_reduction_result`. The semantics are the following: a reduction handle after it was declared is in an "uninitialised" state. The first time it is used as an argument to a loop, its type is determined (increment/min/max), and is initialised appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in parallel loops are combined together, up until the point, where the result is acquired using `ops_reduction_result`, which then sets it back to an uninitialised state. This also implies, that different parallel loops, which all use the same reduction handle, but are otherwise independent, are independent and their partial reduction results can be combined together associatively and commutatively. -OPS takes responsibility for all data, its movement and the execution of parallel loops. With different execution hardware and optimisations, this means OPS will **re-organise** data as well as execution (potentially across different loops), and therefore **any data accesses or manipulation must only be done through the OPS API**. +OPS takes responsibility for all data, its movement and the execution of parallel loops. 
With different execution hardware and optimisations, this means OPS will **re-organise** data as well as execution (potentially across different loops), and therefore **data accesses or manipulation should only be done through the OPS API**. There is an external data access API that allows access to the data stored by OPS which in turn allows interfacing with external libraries. This restriction is exploited by a lazy execution mechanism in OPS. The idea is that OPS API calls that do not return a result need not be executed immediately, rather queued, and once an API call requires returning some data, operations in the queue are executed, and the result is returned. This allows OPS to analyse and optimise operations in the queue together. This mechanism is fully automated by OPS, and is used with the various `_tiled` executables. For more information on how to use this mechanism for improving CPU performance, see Section on Tiling. Some API calls triggering the execution of queued operations include `ops_reduction_result`, and the functions in the @@ -43,7 +43,7 @@ To further clarify some of the important issues encountered when designing the O OPS handle all of these different requirements through stencil definitions. -## C/C++ API +## C API ### Initialisation and termination routines @@ -567,7 +567,7 @@ in a tech-report on checkpointing, to be published later. ### Access to OPS data -his section describes APIS that give the user access to internal data structures in OPS and return data to user-space. These should be used cautiously and sparsely, as they can affect performance significantly +This section describes APIs that give the user access to internal data structures in OPS and return data to user-space. These should be used cautiously and sparsely, as they can affect performance significantly #### ops_dat_get_local_npartitions From 51d6ee7a14cf7ec2d880f540c3ce4cda9480e8a3 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 26 Oct 2021 10:28:42 +0100 Subject: [PATCH 151/324] Update introduction.md --- doc/introduction.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/introduction.md b/doc/introduction.md index 1e25b5f3e1..d12c564010 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -2,9 +2,11 @@ ## Overview -[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language (eDSL) for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. +[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language (eDSL) for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. -The current OPS eDSL supports generating code targeting multi-core/multi-threaded CPUs, many-core GPUs and clusters of CPUs and GPUs using a range of paralleization models including SIMD vectorization, OpenMP, CUDA, OpenCL, OpenACC and their combinations with MPI. There is also experimental support for paralleizations using SYCL and AMD HIP. Various optimizations for each paralleization can be generated automatically, including cache blocking tiling to improve locality. 
The OPS API and library can also be used to solve scalar multi-dimensional tridiagonal systems using the [tridsolver](https://github.com/OP-DSL/tridsolver) library. +The current OPS eDSL supports generating code targeting multi-core/multi-threaded CPUs, many-core GPUs and clusters of CPUs and GPUs using a range of paralleization models including SIMD vectorization, OpenMP, CUDA, OpenCL, OpenACC and their combinations with MPI. There is also experimental support for paralleizations using SYCL and AMD HIP. Various optimizations for each paralleization can be generated automatically, including cache blocking tiling to improve locality. The OPS API and library can also be used to solve multi-dimensional tridiagonal systems using the [tridsolver](https://github.com/OP-DSL/tridsolver) library. + +These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. ## Licencing From 7838d1cc9e7a483dbf37ec6fb61490026ac17946 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 26 Oct 2021 10:29:22 +0100 Subject: [PATCH 152/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index d12c564010..c3a73f687a 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -10,7 +10,7 @@ These pages provide detailed documentation on using OPS, including an installati ## Licencing -OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. +OPS is released as an open-source project under the BSD 3-Clause License. See the [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) file for more information. 
## Citing To cite OPS, please reference the following paper: From bce2c0d540e631787df4dcf267d05eadc0159a3c Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 26 Oct 2021 10:31:30 +0100 Subject: [PATCH 153/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index c3a73f687a..b458afae3d 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -36,4 +36,4 @@ The preferred method of reporting bugs and issues with OPS is to submit an issue ## Funding The development of OPS was in part supported by the UK Engineering and Physical Sciences Research Council (EPSRC) grants [EP/K038494/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/K038494/1) (“Future-proof massively-parallel execution of multi-block applications”), [EP/J010553/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/J010553/1) (“Software for Emerging Architectures - ASEArch"), The UK Turbulence Consortium grant [EP/T026170/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/T026170/1), The Janos Bolyai Research Scholarship of the Hungarian Academy of Sciences, the Royal Society through their Industry Fellowship Scheme (INF/R1/180012), and the Thematic Research Cooperation Establishing Innovative Informatic and Info-communication Solutions Project, which has been supported by the European Union and co-financed by the European Social Fund under grant number EFOP-3.6.2-16-2017-00013. Research funding support was also provided by the UK AWE under grants CDK0660 ("The Production of Predictive Models for Future Computing Requirements"), CDK0724 ("AWE Technical Outreach Programme"), AWE grant for "High-level Abstractions for Performance, Portability and Continuity of Scientific Software on Future Computing Systems" and the Numerical Algorithms Group [NAG](https://www.nag.com/). 
-Hardware resources for development and testing provided by the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. DE-AC05-00OR22725, the [ARCHER](http://www.archer.ac.uk) and ARCHER2(https://www.archer2.ac.uk/) UK National Supercomputing Service, [University of Oxford Advanced Research Computing (ARC) facility](http://dx.doi.org/10.5281/zenodo.22558) and through hardware donations and access provided by NVIDIA and Intel. +Hardware resources for development and testing provided by the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. DE-AC05-00OR22725, the [ARCHER](http://www.archer.ac.uk) and [ARCHER2](https://www.archer2.ac.uk/) UK National Supercomputing Service, [University of Oxford Advanced Research Computing (ARC) facility](http://dx.doi.org/10.5281/zenodo.22558) and through hardware donations and access provided by NVIDIA and Intel. From cd3a75c97e1002c49d0246544e1984419ba0fb58 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 26 Oct 2021 10:47:12 +0100 Subject: [PATCH 154/324] Update installation.md --- doc/installation.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 97031296b2..e685820de7 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -4,6 +4,8 @@ ## Dependencies +The following prerequisites and dependencies are required for building OPS. Building each of the **backends** are optional and depends on the hardware and/or capabilities you will be targeting. + **CMake** CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. 
@@ -26,19 +28,19 @@ Python2 is required by the OPS Python translator. The CMake build system will tr [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using `-DHDF5_ROOT`. - **CUDA** + **CUDA Backend** The [CUDA](https://developer.nvidia.com/cuda-downloads) backend targets NVIDIA GPUs with a compute capability of 3.0 or greater. The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. -**HIP** +**HIP Backend** -The HIP backend targets AMD GPUs and NVIDIA GPUs which are supported by HIP - either through its CUDA support or the ROCm stack (tested with >=3.9). +The HIP backend targets AMD GPUs and NVIDIA GPUs which are supported by HIP - either through its CUDA support or the [ROCm](https://rocmdocs.amd.com/en/latest/) stack (tested with >=3.9). -**SYCL** +**SYCL Backend** -The SYCL backend is currently in development and only working without MPI. It has been tested with Intel OneAPI (>=2021.1), Intel's public LLVM version, and hipSYCL (>=0.9.1), and runs on Intel CPUs and GPUs through Intel's OpenCL and Level Zero, NVIDIA and AMD GPUs both with the LLVM fork as well as hipSYCL. hipSYCL's OpenMP support covers most CPU architectures too. +The [SYCL](https://www.khronos.org/sycl/) backend is currently in development and only working without MPI. It has been tested with Intel OneAPI (>=2021.1), Intel's public LLVM version, and hipSYCL (>=0.9.1), and runs on Intel CPUs and GPUs through Intel's OpenCL and Level Zero, NVIDIA and AMD GPUs both with the LLVM fork as well as hipSYCL. hipSYCL's OpenMP support covers most CPU architectures too. 
-**Tridiagonal Solver** +**Tridiagonal Solver Backend** To use the tridiagonal solver OPS API in applications and build example applications such as `adi`, `adi_burger` and `adi_burger_3D` the open source tridiagonal solver (scalar) library needs to be cloned and built from the [Tridsolver repository](https://github.com/OP-DSL/tridsolver). ```bash From 6188b29a9dc5a4c11bdf560caa5905b160706f3a Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 26 Oct 2021 10:49:44 +0100 Subject: [PATCH 155/324] Update installation.md --- doc/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index e685820de7..cbc04f505a 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -127,7 +127,7 @@ The current tests are mainly based on the applications. * `MPI_INSTALL_PATH` - Installation directory of MPI (to build MPI based distributed memory libs and applications) * `HDF5_INSTALL_PATH` - Installation directory of HDF5 (to support HDF5 based File I/O) -See example scripts (e.g. source_intel, source_pgi_15.10, source_cray) under `OPS/ops/scripts` that sets up the environment for building with various compilers (Intel, PGI, Cray). +See example scripts (e.g. `source_intel`, `source_pgi_15.10`, `source_cray`) under `OPS/ops/scripts` that sets up the environment for building with various compilers (Intel, PGI, Cray). #### Build back-end library For C/C++ back-end use Makefile under `OPS/ops/c` (modify Makefile if required). 
The libraries will be built in `OPS/ops/c/lib` From 0cdf8bf10c9d9fe3f92b53026dcef920f9b2851c Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Fri, 29 Oct 2021 17:36:53 +0100 Subject: [PATCH 156/324] Minor upgrade on Doxyfile --- doc/ops/Doxyfile | 253 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 172 insertions(+), 81 deletions(-) diff --git a/doc/ops/Doxyfile b/doc/ops/Doxyfile index 32f7c733c8..a116d6466f 100644 --- a/doc/ops/Doxyfile +++ b/doc/ops/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.8.11 +# Doxyfile 1.8.16 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -17,11 +17,11 @@ # Project related configuration options #--------------------------------------------------------------------------- -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 @@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "OPS" +PROJECT_NAME = OPS # The PROJECT_NUMBER tag can be used to enter a project or revision number. 
This # could be handy for archiving the generated documentation or if some version @@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO OUTPUT_LANGUAGE = English +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. @@ -179,6 +187,16 @@ SHORT_NAMES = NO JAVADOC_AUTOBRIEF = YES +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus @@ -226,7 +244,12 @@ TAB_SIZE = 4 # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. 
+# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) ALIASES = @@ -264,17 +287,26 @@ OPTIMIZE_FOR_FORTRAN = YES OPTIMIZE_OUTPUT_VHDL = NO +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files), VHDL, tcl. 
For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is +# Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # @@ -285,7 +317,7 @@ EXTENSION_MAPPING = inc=Fortran # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. +# documentation. See https://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. @@ -293,6 +325,15 @@ EXTENSION_MAPPING = inc=Fortran MARKDOWN_SUPPORT = YES +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -309,7 +350,7 @@ AUTOLINK_SUPPORT = YES # diagrams that involve STL classes more complete and accurate. # The default value is: NO. -BUILTIN_STL_SUPPORT = NO +BUILTIN_STL_SUPPORT = YES # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. 
@@ -318,7 +359,7 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. @@ -424,6 +465,12 @@ EXTRACT_ALL = YES EXTRACT_PRIVATE = NO +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. @@ -502,7 +549,7 @@ INTERNAL_DOCS = NO # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. +# (including Cygwin) ands Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = YES @@ -512,7 +559,7 @@ CASE_SENSE_NAMES = YES # scope will be hidden. # The default value is: NO. -HIDE_SCOPE_NAMES = NO +HIDE_SCOPE_NAMES = YES # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to @@ -689,7 +736,7 @@ LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. 
See also http://en.wikipedia.org/wiki/BibTeX for more info. +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. @@ -734,7 +781,8 @@ WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. # The default value is: NO. WARN_NO_PARAMDOC = NO @@ -771,12 +819,16 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = ../README.md ../ops/c/src ../ops/c/include ../ops/fortran/src ../ops/fortran/include +INPUT = ../README.md \ + ../ops/c/src \ + ../ops/c/include \ + ../ops/fortran/src \ + ../ops/fortran/include # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of # possible encodings. # The default value is: UTF-8. 
@@ -793,8 +845,8 @@ INPUT_ENCODING = UTF-8 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, -# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = @@ -949,7 +1001,7 @@ INLINE_SOURCES = NO STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. +# entity all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO @@ -981,12 +1033,12 @@ SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version +# (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # @@ -1014,7 +1066,7 @@ VERBATIM_HEADERS = YES # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was -# generated with the -Duse-libclang=ON option for CMake. +# generated with the -Duse_libclang=ON option for CMake. # The default value is: NO. 
CLANG_ASSISTED_PARSING = NO @@ -1027,6 +1079,16 @@ CLANG_ASSISTED_PARSING = NO CLANG_OPTIONS = +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. + +CLANG_DATABASE_PATH = + #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- @@ -1145,7 +1207,7 @@ HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. @@ -1181,6 +1243,17 @@ HTML_COLORSTYLE_GAMMA = 80 HTML_TIMESTAMP = NO +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. @@ -1204,13 +1277,13 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# environment (see: https://developer.apple.com/xcode/), introduced with OSX +# 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1249,7 +1322,7 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output @@ -1325,7 +1398,7 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. 
For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1333,7 +1406,7 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1342,7 +1415,7 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1350,7 +1423,7 @@ QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1358,7 +1431,7 @@ QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). 
# This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = @@ -1416,7 +1489,7 @@ DISABLE_INDEX = NO # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -GENERATE_TREEVIEW = NO +GENERATE_TREEVIEW = YES # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. @@ -1451,7 +1524,7 @@ EXT_LINKS_IN_WINDOW = NO FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # @@ -1463,7 +1536,7 @@ FORMULA_FONTSIZE = 10 FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering +# https://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want to formulas look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path @@ -1490,8 +1563,8 @@ MATHJAX_FORMAT = NativeMML # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/. # This tag requires that the tag USE_MATHJAX is set to YES. 
MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest @@ -1501,7 +1574,8 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +MATHJAX_EXTENSIONS = TeX/AMSmath \ + TeX/AMSsymbols # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1552,7 +1626,7 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). +# Xapian (see: https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1565,7 +1639,7 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). See the section "External Indexing and +# Xapian (see: https://xapian.org/). See the section "External Indexing and # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. @@ -1617,21 +1691,35 @@ LATEX_OUTPUT = ops/latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. # -# Note that when enabling USE_PDFLATEX this option is only used for generating -# bitmaps for formulas in the HTML output, but not in the Makefile that is -# written to the output directory. -# The default file is: latex. +# Note that when not enabling USE_PDFLATEX the default is latex when enabling +# USE_PDFLATEX the default is pdflatex and when in the later case latex is +# chosen this is overwritten by pdflatex. 
For specific output languages the +# default can have been set differently, this depends on the implementation of +# the output language. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate # index for LaTeX. +# Note: This tag is used in the Makefile / make.bat. +# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file +# (.tex). # The default file is: makeindex. # This tag requires that the tag GENERATE_LATEX is set to YES. MAKEINDEX_CMD_NAME = makeindex +# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to +# generate index for LaTeX. In case there is no backslash (\) as first character +# it will be automatically added in the LaTeX code. +# Note: This tag is used in the generated output file (.tex). +# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat. +# The default value is: makeindex. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_MAKEINDEX_CMD = makeindex + # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX # documents. This may be useful for small projects and may help to save some # trees in general. @@ -1752,7 +1840,7 @@ LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See -# http://en.wikipedia.org/wiki/BibTeX and \cite for more info. +# https://en.wikipedia.org/wiki/BibTeX and \cite for more info. # The default value is: plain. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1766,6 +1854,14 @@ LATEX_BIB_STYLE = plain LATEX_TIMESTAMP = NO +# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) +# path from which the emoji images will be read. If a relative path is entered, +# it will be relative to the LATEX_OUTPUT directory. If left blank the +# LATEX_OUTPUT directory will be used. 
+# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_EMOJI_DIRECTORY = + #--------------------------------------------------------------------------- # Configuration options related to the RTF output #--------------------------------------------------------------------------- @@ -1805,9 +1901,9 @@ COMPACT_RTF = NO RTF_HYPERLINKS = NO -# Load stylesheet definitions from file. Syntax is similar to doxygen's config -# file, i.e. a series of assignments. You only have to provide replacements, -# missing definitions are set to their default value. +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# configuration file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. # # See also section "Doxygen usage" for information on how to generate the # default style sheet that doxygen normally uses. @@ -1816,8 +1912,8 @@ RTF_HYPERLINKS = NO RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is -# similar to doxygen's config file. A template extensions file can be generated -# using doxygen -e rtf extensionFile. +# similar to doxygen's configuration file. A template extensions file can be +# generated using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. RTF_EXTENSIONS_FILE = @@ -1903,6 +1999,13 @@ XML_OUTPUT = xml XML_PROGRAMLISTING = YES +# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include +# namespace members in file scope as well, matching the HTML output. +# The default value is: NO. +# This tag requires that the tag GENERATE_XML is set to YES. 
+ +XML_NS_MEMB_FILE_SCOPE = NO + #--------------------------------------------------------------------------- # Configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- @@ -1935,9 +2038,9 @@ DOCBOOK_PROGRAMLISTING = NO #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sf.net) file that captures the -# structure of the code including all documentation. Note that this feature is -# still experimental and incomplete at the moment. +# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# the structure of the code including all documentation. Note that this feature +# is still experimental and incomplete at the moment. # The default value is: NO. GENERATE_AUTOGEN_DEF = NO @@ -2037,7 +2140,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS +PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2104,12 +2207,6 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of 'which perl'). -# The default file (with absolute path) is: /usr/bin/perl. - -PERL_PATH = /usr/bin/perl - #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- @@ -2123,15 +2220,6 @@ PERL_PATH = /usr/bin/perl CLASS_DIAGRAMS = YES -# You can define message sequence charts within doxygen comments using the \msc -# command. 
Doxygen will then run the mscgen tool (see: -# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. @@ -2150,7 +2238,7 @@ HIDE_UNDOC_RELATIONS = YES # http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO -# The default value is: YES. +# The default value is: NO. HAVE_DOT = YES @@ -2306,9 +2394,7 @@ DIRECTORY_GRAPH = YES # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). -# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd, -# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo, -# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo, +# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo, # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and # png:gdiplus:gdiplus. # The default value is: png. @@ -2361,6 +2447,11 @@ DIAFILE_DIRS = PLANTUML_JAR_PATH = +# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a +# configuration file for plantuml. + +PLANTUML_CFG_FILE = + # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. 
From b5d76335c517d69c3270f0aeda941f31f1145cb0 Mon Sep 17 00:00:00 2001 From: Toby Flynn Date: Mon, 1 Nov 2021 17:50:02 +0000 Subject: [PATCH 157/324] OPS-Tridsolver docs --- doc/opsapi.md | 56 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 3562a11453..dc8d6b5463 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -663,6 +663,7 @@ A single call to ops_dat_release_raw_data() releases all pointers obtained by pr #### ops_dat_fetch_data __void ops_dat_fetch_data(ops_dat dat, int part, int *data)__ + This routine copies the data held by OPS to the user-specified memory location, which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents. | Arguments | Description | @@ -683,9 +684,56 @@ This routine copies the data given by the user to the internal data structure u |part| the chunk index (has to be 0)| |data| pointer to memory which should be copied to OPS | +### Tridsolver Calls +This section specifies APIs that allow [Tridsolver](https://github.com/OP-DSL/tridsolver) (a tridiagonal solver library) to be called from OPS. The library can be used to solve a large number of tridiagonal systems of equations stored in multidimensional datasets. Parameters that are passed to Tridsolver from OPS are stored in an `ops_tridsolver_params` object. The constructor for this class takes the `ops_block` that the datasets are defined over as an argument and optionally also a solving strategy to use (only relevant to MPI applications). The following solving strategies are available (see Tridsolver for more details about these): + +- GATHER_SCATTER (not available for GPUs) +- ALLGATHER +- LATENCY_HIDING_TWO_STEP +- LATENCY_HIDING_INTERLEAVED +- JACOBI +- PCR (default) + +Then parameters specific to different solving strategies can be set using setter methods. 
For applications using MPI, it is beneficial to reuse `ops_tridsolver_params` objects between solves as much as possible due to set up times involved with creating Tridsolver's MPI communicators. + +#### ops_tridMultiDimBatch + +__void ops_tridMultiDimBatch(int ndim, int solvedim, int* dims, ops_dat a, ops_dat b, ops_dat c, ops_dat d, ops_tridsolver_params *tridsolver_ctx)__ + +This solves multiple tridiagonal systems of equations in multidimensional datasets along the specified dimension. The matrix is stored in the `a` (bottom diagonal), `b` (central diagonal) and `c` (top diagonal) datasets. The right hand side is stored in the `d` dataset and the result is also written to this dataset. + +| Arguments | Description | +| ----------- | ----------- | +|ndim| the dimension of the datasets | +|solvedim| the dimension to solve along | +|dims| the size of each dimension (excluding any padding) | +|a| the dataset for the lower diagonal | +|b| the dataset for the central diagonal | +|c| the dataset for the upper diagonal | +|d| the dataset for the right hand side, also where the solution is written to | +|tridsolver_ctx| an object containing the parameters for the Tridsolver library | + +#### ops_tridMultiDimBatch_Inc + +__void ops_tridMultiDimBatch(int ndim, int solvedim, int* dims, ops_dat a, ops_dat b, ops_dat c, ops_dat d, ops_dat u, ops_tridsolver_params *tridsolver_ctx)__ + +This solves multiple tridiagonal systems of equations in multidimensional datasets along the specified dimension. The matrix is stored in the `a` (bottom diagonal), `b` (central diagonal) and `c` (top diagonal) datasets. The right hand side is stored in the `d` dataset and the result is added to the `u` dataset. 
+ +| Arguments | Description | +| ----------- | ----------- | +|ndim| the dimension of the datasets | +|solvedim| the dimension to solve along | +|dims| the size of each dimension (excluding any padding) | +|a| the dataset for the lower diagonal | +|b| the dataset for the central diagonal | +|c| the dataset for the upper diagonal | +|d| the dataset for the right hand side | +|u| the dataset that the soluion is added to | +|tridsolver_ctx| an object containing the parameters for the Tridsolver library | + ## Runtime Flags and Options -The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. +The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. ### General flags * `OPS_DIAGS=` * `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` @@ -694,11 +742,5 @@ The following is a list of all the runtime flags and options that can be used wh * `OPS_TILING` * `OPS_TILING_MAXDEPTH=` -### Tridsolver API flags -* `-halo 1` -* `-m` -* `-bx`, `-by` and `-bz` - - ## Doxygen Doxygen generated from OPS source can be found [here](https://op-dsl-ci.gitlab.io/ops-ci/). From 83a6b9038beebea474b3a06caa2b4458b85a1c45 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Wed, 3 Nov 2021 12:34:32 +0000 Subject: [PATCH 158/324] C++ API instance block --- doc/opsapi.md | 95 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 17 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 3562a11453..ec79b0b90f 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -43,11 +43,15 @@ To further clarify some of the important issues encountered when designing the O OPS handle all of these different requirements through stencil definitions. -## C API +## OPS C and C++ API -### Initialisation and termination routines +Both C and C++ styles API are provided for utilizing the capabilities provided by the OPS library. 
They are essentially the same although there are minor differences in syntax. The C++ API is mainly designed for data abstraction, which therefore provides better data encapsulation and the support of multiple instances and threading (OpenMP currently). In the following both C style routines and C++ class and methods will be introduced according their functionality with a notice (C) or (C++). If there is no such notice, the routine will apply to both. + +To enable the C++ API, a compiler directive ``OPS_CPP_API`` is required. -#### ops_init +### Initialisation and termination routines +#### C Style +##### ops_init __void ops_init(int argc, char** argv, int diags_level)__ @@ -77,10 +81,26 @@ MPI receives depth (for OPS internal development only) __void ops_exit()__ This routine must be called last to cleanly terminate the OPS computation. +#### C++ style + +With the C++ style APIs, all data structures (block, data and stencils etc ) are encapsulated into a class ``OPS_instance``. Thus, we can allocate multiple instances of ``OPS_instance`` by using the class constructor, for example, + +```c++ +// Allocate an instance +OPS_instance *instance = new OPS_instance(argc,argv,1,ss); +``` + +where the meaning of arguments are same to the C API, while the extra argument (i.e., ss) is for accpeting the messages. + +An explicit termination is not needed for the C++ API, although we need to "delete" the instance in if it is allocated through pointer, i.e., +```C++ +delete instance; +``` ### Declaration routines -#### ops_decl_block +#### Block +##### ops_decl_block (C) __ops_block ops_decl_block(int dims, char *name)__ @@ -90,7 +110,15 @@ This routine defines a structured grid block. | dims | dimension of the block | | name | a name used for output diagnostics | -#### ops_decl_block_hdf5 +##### OPS_instance::decl_block (C++) + +A method of the OPS_instance class for declaring a block, which accepts same arguments with the C style function. 
A OPS_instance object should be constructed before this. The method returns a pointer to a ops_block type variable, where ops_block is an alias to a pointer type of ops_block_core. An example is + +```C++ +ops_block grid2D = instance->decl_block(2, "grid2D"); +``` + +##### ops_decl_block_hdf5 (C) __ops_block ops_decl_block_hdf5(int dims, char *name, char *file)__ @@ -108,7 +136,8 @@ arguments, it is included here for error checking (e.g. check if blocks defined in an HDF5 file is matching with the declared arguments in an application) and completeness. -#### ops_decl_dat +#### Dat (ops_cat_core) +##### ops_decl_dat (C) __ops_dat ops_decl_dat(ops block block, int dim, int *size, int *base, int *dm, int *d p, T *data, char *type, char *name)__ @@ -131,7 +160,16 @@ The `size` allows to declare different sized data arrays on a given indicate the offset from the edge of a block (in both the negative and positive directions of each dimension). -#### ops_decl_dat_hdf5 +##### ops_block_core::decl_dat (C++) +The method ops_block_core::decl_dat is used to define a ops_dat object, which accepts almost same arguments with the C conterpart where the block argument is not necessary, e.g., +```C++ +//declare ops_dat with dim = 2 +ops_dat dat0 = grid2D->decl_dat(2, size, base, d_m, d_p, temp, "double", "dat0"); +ops_dat dat1 = grid2D->decl_dat(2, size, base, d_m, d_p, temp, "double", "dat1"); +``` +where grid2D is a ops_block_core object which shall be defined before this. + +##### ops_decl_dat_hdf5 (C) __ops_dat ops_decl_dat_hdf5(ops_block block, int dim, char *type, char *name, char *file)__ @@ -145,12 +183,13 @@ type | the name of type used for output diagnostics (e.g. 
``double``,``float |name | name of the dat used for output diagnostics| |file | hdf5 file to read and obtain the data from| -#### ops_decl_const +#### Global constant +##### ops_decl_const (C) __void ops_decl_const(char const * name, int dim, char const * type, T * data )__ This routine defines a global constant: a variable in global scope. Global constants need to be declared upfront - so that they can be correctly handled for different parallelizations. For e.g CUDA on GPUs. Once defined + so that they can be correctly handled for different parallelization. For e.g CUDA on GPUs. Once defined they remain unchanged throughout the program, unless changed by a call to ops_update_const(..). The ``name'' and``type'' parameters **must** be string literals since they are used in the code generation step @@ -161,7 +200,12 @@ This routine defines a global constant: a variable in global scope. Global const |type | the name of type used for output diagnostics (e.g. ``double``, ``float``) | |data | pointer to input data of type *T* | -#### ops_decl_halo +##### OPS_instance::decl_const (C++) + +The method accepts same arguments with its C counterpart. + +#### Halo definition +##### ops_decl_halo (C) __ops_halo ops_decl_halo(ops_dat from, ops_dat to, int *iter_size, int* from_base, int *to_base, int *from_dir, int *to_dir)__ @@ -183,7 +227,10 @@ iter_size = \[2,100,100\], from_base = \[1,0,0\], to_base = \[-1,0,0\], from_dir = \[0,1,2\], to_dir = \[0,1,2\]. In more complex case this allows for transfers between blocks with different orientations.) -#### ops_decl_halo_hdf5 +##### OPS_instance::decl_halo (C++) +The method accepts same arguments with its C counterpart. 
+ +##### ops_decl_halo_hdf5 (C) __ops_halo ops_decl_halo_hdf5(ops_dat from, ops_dat to, char* file)__ @@ -195,7 +242,7 @@ This routine reads in a halo relationship between two datasets defined on two di |to| destination dataset| |file| hdf5 file to read and obtain the data from| -#### ops_decl_halo_group +##### ops_decl_halo_group (C) __ops_halo_group ops_decl_halo_group(int nhalos, ops_halo *halos)__ @@ -205,7 +252,12 @@ This routine defines a collection of halos. Semantically, when an exchange is tr |nhalos| number of halos in *halos* | |halos| array of halos| -#### ops_decl_reduction_handle +##### OPS_instance::decl_halo_group (C++) + +The method accepts same arguments with its C counterpart. + +#### Reduction handle +##### ops_decl_reduction_handle (C) __ops_reduction ops_decl_reduction_handle(int size, char *type, char *name)__ This routine defines a reduction handle to be used in a parallel loop @@ -222,7 +274,10 @@ __{void ops_reduction_result(ops_reduction handle, T *result) |handle| the *ops_reduction* handle | |result| a pointer to write the results to, memory size has to match the declared | -#### ops_partition +##### OPS_instance::decl_reduction_handle (C++) +The method accepts same arguments with its C counterpart. +#### Partition +##### ops_partition (C) __ops_partition(char *method)__ @@ -233,15 +288,21 @@ and ops_halo ops_decl_dat statements have been declared | ----------- | ----------- | |method| string describing the partitioning method. Currently this string is not used internally, but is simply a place-holder to indicate different partitioning methods in the future. | + +##### OPS_instance::partition (C++) + +The method accepts same arguments with its C counterpart. ### Diagnostic and output routines -#### ops_diagnostic_output +#### ops_diagnostic_output (C) __void ops_diagnostic_output()__ This routine prints out various useful bits of diagnostic info about sets, mappings and datasets. 
Usually used right after an ops_partition() call to print out the details of the decomposition +#### OPS_instance::diagnostic_output (C++) +Same to the C counterpart. #### ops_printf __void ops_printf(const char * format, ...)__ @@ -329,7 +390,7 @@ is found, prints an error message and exits. ### Halo exchange -#### ops_halo_transfer +#### ops_halo_transfer (C) __void ops_halo_transfer(ops_halo_group group)__ @@ -685,7 +746,7 @@ This routine copies the data given by the user to the internal data structure u ## Runtime Flags and Options -The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. +The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. ### General flags * `OPS_DIAGS=` * `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` From 75ee4cd8fb6e08e1764e746f944d01b0ea70e537 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Thu, 4 Nov 2021 09:52:18 +0000 Subject: [PATCH 159/324] C++ API Data Access --- doc/opsapi.md | 98 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 00551b64ed..4ab720fceb 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -45,7 +45,7 @@ OPS handle all of these different requirements through stencil definitions. ## OPS C and C++ API -Both C and C++ styles API are provided for utilizing the capabilities provided by the OPS library. They are essentially the same although there are minor differences in syntax. The C++ API is mainly designed for data abstraction, which therefore provides better data encapsulation and the support of multiple instances and threading (OpenMP currently). In the following both C style routines and C++ class and methods will be introduced according their functionality with a notice (C) or (C++). If there is no such notice, the routine will apply to both. 
+Both C and C++ styles API are provided for utilizing the capabilities provided by the OPS library. They are essentially the same although there are minor differences in syntax. The C++ API is mainly designed for data abstraction, which therefore provides better data encapsulation and the support of multiple instances and threading (OpenMP currently). In the following both C style routines and C++ class and methods will be introduced according to their functionality with a notice (C) or (C++). If there is no such notice, the routine either applies to both or might not be provided by the C++ API. To enable the C++ API, a compiler directive ``OPS_CPP_API`` is required. @@ -474,7 +474,7 @@ used to generate initial geometry. The final ingredient is the stencil specification, for which we have two versions: simple and strided. -#### ops_decl_stencil +#### ops_decl_stencil (C) __ops_stencil ops_decl_stencil(int dims,int points, int *stencil, char *name)__ @@ -485,7 +485,10 @@ __ops_stencil ops_decl_stencil(int dims,int points, int *stencil, char *name)__ |stencil| stencil for accessing data| |name| string representing the name of the stencil| -#### ops_decl_strided_stencil +#### OPS_instance::decl_stencil (C++) + +The method accepts the same arguments as its C counterpart. +#### ops_decl_strided_stencil (C) __ops_stencil ops_decl_strided_stencil(int dims, int points, int *stencil, int *stride, char *name)__ @@ -497,6 +500,10 @@ __ops_stencil ops_decl_strided_stencil(int dims, int points, int *stencil, int * |stride| stride for accessing data| |name| string representing the name of the stencil| +#### OPS_instance::decl_strided_stencil (C++) + +The method accepts the same arguments as its C counterpart. + +#### ops_decl_stencil_hdf5 __ops_stencil ops_decl_stencil_hdf5(int dims,int points, char *name, char* file)__ @@ -537,10 +544,7 @@ dimension applications (with a stride of 0 for the relevant dimensions). 
### Checkpointing -OPS supports the automatic checkpointing of applications. Using the API below, the user specifies the file name for the -checkpoint and an average time interval between checkpoints, OPS will then automatically save all necessary information -periodically that is required to fast-forward to the last checkpoint if a crash occurred. Currently, when re-launching -after a crash, the same number of MPI processes have to be used. To enable checkpointing mode, the *OPS_CHECKPOINT* runtime argument has to be used. +OPS supports the automatic checkpointing of applications. Using the API below, the user specifies the file name for the checkpoint and an average time interval between checkpoints, OPS will then automatically save all necessary information periodically that is required to fast-forward to the last checkpoint if a crash occurred. Currently, when re-launching after a crash, the same number of MPI processes have to be used. To enable checkpointing mode, the *OPS_CHECKPOINT* runtime argument has to be used. (**Do we also need to define the CHECKPOINTING compiler directive?**) #### ops_checkpointing_init @@ -559,8 +563,7 @@ mode, false otherwise. * OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually controls the location of the checkpoint, and explicitly specifies the list of *ops_dat*s to be saved. 
-* OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the location of the checkpoint, and it also enables fast-forwarding, by skipping the execution of the -application (even though none of the parallel loops would actually execute, there may be significant work outside of those) up to the checkpoint +* OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the location of the checkpoint, and it also enables fast-forwarding, by skipping the execution of the application (even though none of the parallel loops would actually execute, there may be significant work outside of those) up to the checkpoint * OPS_CHECKPOINT_MANUAL - Indicates that when the corresponding API function is called, the checkpoint should be created. Assumes the presence of the above two options as well. @@ -570,8 +573,7 @@ __void ops_checkpointing_manual_datlist(int ndats, ops_dat *datlist)__ A user can call this routine at a point in the code to mark the location of a checkpoint. At this point, the list of datasets specified will be saved. The validity of what is saved is not checked by the checkpointing algorithm assuming that the user knows -what data sets to be saved for full recovery. This routine should be called frequently (compared to check-pointing -frequency) and it will trigger the creation of the checkpoint the first time it is called after the timeout occurs. +what data sets to be saved for full recovery. This routine should be called frequently (compared to check-pointing frequency) and it will trigger the creation of the checkpoint the first time it is called after the timeout occurs. | Arguments | Description | | ----------- | ----------- | @@ -630,7 +632,7 @@ in a tech-report on checkpointing, to be published later. This section describes APIs that give the user access to internal data structures in OPS and return data to user-space. 
These should be used cautiously and sparsely, as they can affect performance significantly -#### ops_dat_get_local_npartitions +#### ops_dat_get_local_npartitions (C) __int ops_dat_get_local_npartitions(ops_dat dat)__ @@ -640,7 +642,9 @@ This routine returns the number of chunks of the given dataset held by the curre | ----------- | ----------- | |dat| the dataset| -#### ops_dat_get_global_npartitions} +#### ops_dat_core::get_local_npartitions (C++) +The C++ version of ``ops_dat_get_local_npartitions``, which does not require input. +#### ops_dat_get_global_npartitions (C) __int ops_dat_get_global_npartitions(ops_dat dat)__ @@ -650,7 +654,9 @@ This routine returns the number of chunks of the given dataset held by all proce | ----------- | ----------- | |dat| the dataset -#### ops_dat_get_extents +#### ops_dat_core::get_global_npartitions (C++) +The C++ version of ``ops_dat_get_global_npartitions``, which does not require input. +#### ops_dat_get_extents (C) __void ops_dat_get_extents(ops_dat dat, int part, int *disp, int *sizes)__ @@ -663,7 +669,10 @@ This routine returns the MPI displacement and size of a given chunk of the given |disp| an array populated with the displacement of the chunk within the ``global'' distributed array| |sizes| an array populated with the spatial extents| -#### ops_dat_get_raw_metadata +#### ops_dat_core::get_extents (C++) +The C++ version of ``ops_dat_get_extents`` where the arguments are the same except no need of the ops_dat arguments. + +#### ops_dat_get_raw_metadata (C) __char* ops_dat_get_raw_metadata(ops_dat dat, int part, int *disp, int *size, int *stride, int *d_m, int *d_p)__ @@ -679,7 +688,9 @@ This routine returns array shape metadata corresponding to the ops_dat. Any of t |d_m| an array populated with padding on the left in each dimension. 
Note that these are negative values| |d_p| an array populated with padding on the right in each dimension| -#### ops_dat_get_raw_pointer +#### ops_dat_core::get_raw_metadata (C++) +The C++ version of ``ops_dat_get_raw_metadata`` where the arguments are the same except no need of the ops_dat arguments. +#### ops_dat_get_raw_pointer (C) __char* ops_dat_get_raw_pointer(ops_dat dat, int part, ops_stencil stencil, ops_memspace *memspace)__ @@ -692,7 +703,9 @@ This routine returns a pointer to the internally stored data, with MPI halo regi |stencil| a stencil used to determine required MPI halo exchange depths| |memspace| when set to OPS_HOST or OPS_DEVICE, returns a pointer to data in that memory space, otherwise must be set to 0, and returns whether data is in the host or on the device| -#### ops_dat_release_raw_data +#### ops_dat_core::get_raw_pointer (C++) +The C++ version of ``ops_dat_get_raw_pointer`` where the arguments are the same except no need of the ops_dat arguments. +#### ops_dat_release_raw_data (C) __void ops_dat_release_raw_data(ops_dat dat, int part, ops_access acc)__ @@ -706,34 +719,35 @@ A single call to ops_dat_release_raw_data() releases all pointers obtained by pr |part| the chunk index (has to be 0)| |acc| the kind of access that was used by the user (OPS_READ if it was read only, OPS_WRITE if it was overwritten, OPS_RW if it was read and written)| -#### ops_dat_release_raw_data +#### ops_dat_core::release_raw_data (C++) +The C++ version of ``ops_dat_release_raw_data`` where the arguments are the same except no need of the ops_dat arguments. +#### ops_dat_fetch_data (C) -__void ops_dat_release_raw_data_memspace(ops_dat dat, int part, ops_access acc, ops_memspace *memspace)__ - -Indicates to OPS that a dataset previously accessed with ops_dat_get_raw_pointer is released by the user, and also tells OPS how it was accessed, and which memory space was used. 
+__void ops_dat_fetch_data(ops_dat dat, int part, int *data)__ -A single call to ops_dat_release_raw_data() releases all pointers obtained by previous calls to ops_dat_get_raw_pointer() calls on the same dat and with the same *memspace argument, i.e. calls do not nest. +This routine copies the data held by OPS to the user-specified memory location, which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents. | Arguments | Description | | ----------- | ----------- | |dat| the dataset| -|part| the chunk index (has to be 0)| -|acc| the kind of access that was used by the user (OPS_READ if it was read only, OPS_WRITE if it was overwritten, OPS_RW if it was read and written)| -|memspace| set to OPS_HOST or OPS_DEVICE | +|part| the chunk index (has to be 0) | +|data| pointer to memory which should be filled by OPS| -#### ops_dat_fetch_data +#### ops_dat_fetch_data_memspace (C) -__void ops_dat_fetch_data(ops_dat dat, int part, int *data)__ +__void ops_dat_fetch_data_memspace(ops_dat dat, int part, char *data, ops_memspace memspace)__ -This routine copies the data held by OPS to the user-specified memory location, which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents. +This routine copies the data held by OPS to the user-specified memory location, which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents. | Arguments | Description | | ----------- | ----------- | |dat| the dataset| |part| the chunk index (has to be 0) | |data| pointer to memory which should be filled by OPS| - -#### ops_dat_set_data +| memspace |the memory space where the data pointer is| +#### ops_dat_core::fetch_data (C++) +The C++ version of ``ops_dat_fetch_data_memspace`` where the arguments are the same except no need of the ops_dat arguments. 
+#### ops_dat_set_data (C) __void ops_dat_set_data(ops_dat dat, int part, int *data)__ @@ -745,7 +759,25 @@ This routine copies the data given by the user to the internal data structure u |part| the chunk index (has to be 0)| |data| pointer to memory which should be copied to OPS | -### Tridsolver Calls + +#### ops_dat_set_data_memspace (C) + +__void ops_dat_set_data_memspace(ops_dat dat, int part, char *data, ops_memspace memspace)__ + +This routine copies the data given by the user to the internal data structure used by OPS. User data needs to be laid out in column-major order and strided as indicated by the sizes parameter of ops_dat_get_extents. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|data| pointer to memory which should be copied to OPS | +|memspace| the memory space where the data pointer is| + +#### ops_dat_core::set_data (C++) +The C++ version of ``ops_dat_set_data_memspace`` where the arguments are the same except no need of the ops_dat arguments. ### Linear algebra solvers + +#### Tridiagonal solver This section specifies APIs that allow [Tridsolver](https://github.com/OP-DSL/tridsolver) (a tridiagonal solver library) to be called from OPS. The library can be used to solve a large number of tridiagonal systems of equations stored in multidimensional datasets. Parameters that are passed to Tridsolver from OPS are stored in an `ops_tridsolver_params` object. The constructor for this class takes the `ops_block` that the datasets are defined over as an argument and optionally also a solving strategy to use (only relevant to MPI applications). The following solving strategies are available (see Tridsolver for more details about these): - GATHER_SCATTER (not available for GPUs) 
For applications using MPI, it is beneficial to reuse `ops_tridsolver_params` objects between solves as much as possible due to set up times involved with creating Tridsolver's MPI communicators. -#### ops_tridMultiDimBatch +##### ops_tridMultiDimBatch __void ops_tridMultiDimBatch(int ndim, int solvedim, int* dims, ops_dat a, ops_dat b, ops_dat c, ops_dat d, ops_tridsolver_params *tridsolver_ctx)__ @@ -774,7 +806,7 @@ This solves multiple tridiagonal systems of equations in multidimensional datase |d| the dataset for the right hand side, also where the solution is written to | |tridsolver_ctx| an object containing the parameters for the Tridsolver library | -#### ops_tridMultiDimBatch_Inc +##### ops_tridMultiDimBatch_Inc __void ops_tridMultiDimBatch(int ndim, int solvedim, int* dims, ops_dat a, ops_dat b, ops_dat c, ops_dat d, ops_dat u, ops_tridsolver_params *tridsolver_ctx)__ From a66808c5cfebb9e098056e376f420afced48eac0 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 5 Nov 2021 08:08:41 +0000 Subject: [PATCH 160/324] Update opsapi.md --- doc/opsapi.md | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 4ab720fceb..e56d336ab9 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -828,12 +828,24 @@ This solves multiple tridiagonal systems of equations in multidimensional datase The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. ### General flags -* `OPS_DIAGS=` -* `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` -* `-gpudirect` -* `OPS_CL_DEVICE=` -* `OPS_TILING` -* `OPS_TILING_MAXDEPTH=` +* `OPS_DIAGS=` : set OPS diagnostics level at runtime. + + `OPS_DIAGS=1` - no diagnostics, default level to achieve the best runtime performance. + + `OPS_DIAGS>1` - print block decomposition and `ops_par_loop` timing breakdown. 
+ + `OPS_DIAGS>4` - print intra-block halo buffer allocation feedback (for OPS internal development only). + + `OPS_DIAGS>5` - check if intra-block halo MPI sends depth match MPI receives depth (for OPS internal development only). + +* `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Z=` : The CUDA (and OpenCL) thread block sizes in X, Y and Z dimensions. The sizes should be an integer between 1 - 1024, and currently they should be selected such that `OPS_BLOCK_SIZE_X`*`OPS_BLOCK_SIZE_Y`*`OPS_BLOCK_SIZE_Z`< 1024 + +* `-gpudirect` : Enable GPU direct support when executing MPI+CUDA executables. + +* `OPS_CL_DEVICE=` : Select the OpenCL device for execution. Usually `OPS_CL_DEVICE=0` selects the CPU and `OPS_CL_DEVICE=1` selects GPUs. The selected device will be reported by OPS during execution. + +* `OPS_TILING` : Execute OpenMP code with cache blocking tiling. See the [Performance Tuning](https://github.com/OP-DSL/OPS/blob/MarkdownDocDev/doc/perf.md) section. +* `OPS_TILING_MAXDEPTH=` : Execute MPI+OpenMP code with cache blocking tiling and further communication avoidance. See the [Performance Tuning](https://github.com/OP-DSL/OPS/blob/MarkdownDocDev/doc/perf.md) section. ## Doxygen Doxygen generated from OPS source can be found [here](https://op-dsl-ci.gitlab.io/ops-ci/). From 6ff98334f8957ed87c2b461250c397cc4e978604 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 5 Nov 2021 08:09:34 +0000 Subject: [PATCH 161/324] Update opsapi.md --- doc/opsapi.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index e56d336ab9..1562380e12 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -827,7 +827,7 @@ This solves multiple tridiagonal systems of equations in multidimensional datase ## Runtime Flags and Options The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. -### General flags + * `OPS_DIAGS=` : set OPS diagnostics level at runtime. 
`OPS_DIAGS=1` - no diagnostics, default level to achieve the best runtime performance. From c4d97450390ba6dac4333e3f10456a1c960bada5 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Fri, 24 Sep 2021 20:59:00 +0100 Subject: [PATCH 162/324] Disable user option in makefile --- doc/Makefile | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/doc/Makefile b/doc/Makefile index 3a6807783a..7e68d6ad2c 100755 --- a/doc/Makefile +++ b/doc/Makefile @@ -8,28 +8,14 @@ .PHONY : all user doxygen clean distclean -all : user doxygen - -user : - pdflatex --shell-escape user.tex - pdflatex --shell-escape user.tex - -bibtex user - pdflatex --shell-escape user.tex - latex_count=8 ; \ - while egrep -s 'Rerun (LaTeX|to get cross-references right)' user.log && [ $$latex_count -gt 0 ] ;\ - do \ - echo "Rerunning latex...." ;\ - pdflatex --shell-escape user.tex ;\ - latex_count=`expr $$latex_count - 1` ;\ - done - +all : doxygen doxygen : doxygen ops/Doxyfile cd ops/latex; make refman.pdf doxygen ops_translator/Doxyfile cd ops_translator/latex; make refman.pdf -clean : +clean : -rm -f *.out *.aux *.blg *.pyg.* *.log *.backup *.toc *~ *.bbl -rm -rf _minted-user From a2abf131a53a655484a464a1f9679728b668a999 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Fri, 24 Sep 2021 21:25:50 +0100 Subject: [PATCH 163/324] Create documentation framework --- .readthedocs.yml | 13 + doc/conf.py | 60 ++++ doc/index.rst | 24 ++ doc/installation.md | 94 +++++ doc/introduction.md | 50 +++ doc/keyconcept.md | 102 ++++++ doc/quickstart.md | 3 + doc/requirement.txt | 2 + doc/user.md | 839 ++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 1187 insertions(+) create mode 100644 .readthedocs.yml create mode 100644 doc/conf.py create mode 100644 doc/index.rst create mode 100644 doc/installation.md create mode 100644 doc/introduction.md create mode 100644 doc/keyconcept.md create mode 100644 doc/quickstart.md create mode 100644 doc/requirement.txt create mode 100644 
doc/user.md diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000000..ceca0e737d --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,13 @@ +# File: .readthedocs.yaml + +version: 2 + +# Build from the docs/ directory with Sphinx +sphinx: + configuration: doc/conf.py + +# Explicitly set the version of Python and its requirements +python: + version: 3.8 + install: + - requirements: doc/requirement.txt \ No newline at end of file diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000000..8be05822d8 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,60 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'Oxford Parallel library for Structured mesh solvers' +copyright = 'Copyright (c) 2013, Mike Giles and others' +author = "Mike Giles, Istvan Reguly, Gihan Mudalige" + +# The full version, including alpha/beta/rc tags +release = 'latest' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.mathjax', + 'sphinx.ext.ifconfig', + 'myst_parser' +] + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ['_templates'] + + +source_suffix = ['.rst', '.md'] +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" #'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000000..d4f72096a9 --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,24 @@ +.. Test documentation master file, created by + sphinx-quickstart on Thu Sep 23 09:45:16 2021. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to Test's documentation! +================================ + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + introduction.md + keyconcept.md + installation.md + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/doc/installation.md b/doc/installation.md new file mode 100644 index 0000000000..c46de11093 --- /dev/null +++ b/doc/installation.md @@ -0,0 +1,94 @@ +# Installation + +**Note: The current CMakefile and relevant instructions are mainly tested on linux-based systems including Windows Subsystem for Linux** + +## Dependencies + + * CMake + + CMake 3.18 or newer is required for using the CMake building system. 
If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. + ```bash + version=3.19.0 + wget https://github.com/Kitware/CMake/releases/download/v$version/cmake-$version-Linux-x86_64.sh + # Assume that CMake is going to be installed at /usr/local/cmake + cmake_dir=/usr/local/cmake + # sudo is not necessary for directories in user space. + sudo mkdir $cmake_dir + sudo sh ./cmake-$version-Linux-x86_64.sh --prefix=$cmake_dir --skip-license + sudo ln -s $cmake_dir/bin/cmake /usr/local/bin/cmake + ``` + + * Python2 + + **Python2** is required by the OPS Python translator. The CMake build system will try to identify it automatically. However, the process can fail sometime (e.g., if there are both Python2 and Python3 installed). If this happens, the path to Python2 can be specified manually by using **-DPython2_EXECUTABLE** when invoking CMake + + * HDF5 + + [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using -DHDF5_ROOT. + + * CUDA + + The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path. 
+ + + + +## Build OPS back-end libraries example applications +### Build the library and example applications together + + Create a build directory, and run CMake (version 3.18 or newer) + ```bash + mkdir build + cd build + # Please see below for CMake options + cmake ${PATH_TO_OPS} -DBUILD_OPS_APPS=ON -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL -DGPU_NUMBER=1 + make # IEEE=1 this option is important for applications to get accurate results + make install # sudo is needed if a directory like /usr/local/ is chosen. + ``` +After installation, the library and the python translator can be found at the direcory specified by CMAKE_INSTALL_PREFIX, together with the executable files for applications at APP_INSTALL_DIR. + +### Build the library and example applications separately + +In this mode, the library can be firstly built and installed as + +```bash + mkdir build + cd build + # Please see below for CMake options + cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL + make # IEEE=1 this option is important for applications to get accurate results + make install # sudo is needed if a system direction is chosen, + ``` +then the application can be built as + +```bash + mkdir appbuild + cd appbuild + # Please see below for CMake options + cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 + make # IEEE=1 this option is important for applications to get accurate results + ``` +### Tests + +A few tasks for testing codes can be run by +```bash + make test + ``` +The current tests are mainly based on the applications. 
+### Options of interest to specify to `cmake` include: + + * `-DCMAKE_BUILD_TYPE=Release` - enable optimizations + * `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only) + * `-DOPS_TEST=ON` - enable the tests + * `-DCMAKE_INSTALL_PREFIX=` - specify the installation direction for the library (/usr/local by default, Library CMake only) + * `-DAPP_INSTALL_DIR=` - specify the installation direction for the applications ($HOME/OPS-APPS by default) + * `-DGPU_NUMBER=` - specify the number of GPUs used in the tests + * `-DOPS_INSTALL_DIR=` - specify where the OPS library is installed (Application CMake only, see [here](#build-the-library-and-example-applications-separately)) + * `-DOPS_VERBOSE_WARNING=ON` - show verbose output during building process + + + diff --git a/doc/introduction.md b/doc/introduction.md new file mode 100644 index 0000000000..515fcee001 --- /dev/null +++ b/doc/introduction.md @@ -0,0 +1,50 @@ +# Introduction + +OPS is a high-level framework with associated libraries and +preprocessors to generate parallel executables for applications on +**multi-block structured grids**. Multi-block structured grids consists +of an unstructured collection of structured meshes/grids. This document +describes the OPS C++ API, which supports the development of +single-block and multi-block structured meshes. + +Many of the API and library follows the structure of the OP2 high-level +library for unstructured mesh applications [@op2]. However the +structured mesh domain is distinct from the unstructured mesh +applications domain due to the implicit connectivity between +neighbouring mesh elements (such as vertices, cells) in structured +meshes/grids. The key idea is that operations involve looping over a +"rectangular" multi-dimensional set of grid points using one or more +"stencils" to access data. In multi-block grids, we have several +structured blocks. 
The connectivity between the faces of different +blocks can be quite complex, and in particular they may not be oriented +in the same way, i.e. an $i,j$ face of one block may correspond to the +$j,k$ face of another block. This is awkward and hard to handle simply. + +To clarify some of the important issues in designing the API, we note +here some needs connected with a 3D application: + +- When looping over the interior with loop indices $i,j,k$, often + there are 1D arrays which are referenced using just one of the + indices. + +- To implement boundary conditions, we often loop over a 2D face, + accessing both the 3D dataset and data from a 2D dataset. + +- To implement periodic boundary conditions using dummy "halo" points, + we sometimes have to copy one plane of boundary data to another. + e.g. if the first dimension has size $I$ then we might copy the + plane $i=I\!-\!2$ to plane $i=0$, and plane $i=1$ to plane + $i=I\!-\!1$. + +- In multigrid, we are working with two grids with one having twice as + many points as the other in each direction. To handle this we + require a stencil with a non-unit stride. + +- In multi-block grids, we have several structured blocks. The + connectivity between the faces of different blocks can be quite + complex, and in particular they may not be oriented in the same way, + i.e. an $i,j$ face of one block may correspond to the $j,k$ face of + another block. This is awkward and hard to handle simply. + +The latest proposal is to handle all of these different requirements +through stencil definitions. \ No newline at end of file diff --git a/doc/keyconcept.md b/doc/keyconcept.md new file mode 100644 index 0000000000..bd26fc25b5 --- /dev/null +++ b/doc/keyconcept.md @@ -0,0 +1,102 @@ +# Key concepts and structure + +An OPS application can generally be divided into two key parts: +initialisation and parallel execution. 
During the initialisation phase, +one or more blocks (ops_block) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a +block, and have a specific size (in each dimension of the block), which +may be slightly different across different datasets (e.g. staggered +grids), in some directions they may be degenerate (a size of 1), or they +can represent data associated with different multigrid levels (where +their size if a multiple or a fraction of other datasets). Datasets can +be declared with empty (NULL) pointers, then OPS will allocate the +appropriate amount of memory, may be passed non-NULL pointers (currently +only supported in non-MPI environments), in which case OPS will assume +the memory is large enough for the data and the block halo, and there +are HDF5 dataset declaration routines which allow the distributed +reading of datasets from HDF5 files. The concept of blocks is necessary +to group datasets together, as in a multi-block problem, in a +distributed memory environment, OPS needs to be able to determine how to +decompose the problem. + +The initialisation phase usually also consists of defining the stencils +to be used later on (though they can be defined later as well), which +describe the data access patterns used in parallel loops. Stencils are +always relative to the "current" point; e.g. if at iteration $(i,j)$, we +wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two +points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in +one of the dimensions the dataset's size is 1), as well as for +multigrid, there are special strided, restriction, and prolongation +stencils: they differ from normal stencils in that as one steps through +a grid in a parallel loop, the stepping is done with a non-unit stride +for these datasets. 
For example, in a 2D problem, if we have a +degenerate dataset called xcoords, size $(N,1)$, then we will need a +stencil with stride $(1,0)$ to access it in a regular 2D loop. + +Finally, the initialisation phase may declare a number of global +constants - these are variables in global scope that can be accessed +from within user kernels, without having to pass them in explicitly. +These may be scalars or small arrays, generally for values that do not +change during execution, though they may be updated during execution +with repeated calls to `ops_decl_const`. + +The initialisation phase is terminated by a call to `ops_partition`. + +The bulk of the application consists of parallel loops, implemented +using calls to `ops_par_loop`. These constructs work with datasets, +passed through the opaque `ops_dat` handles declared during the +initialisation phase. The iterations of parallel loops are semantically +independent, and it is the responsibility of the user to enforce this: +the order in which iterations are executed cannot affect the result +(within the limits of floating point precision). Parallel loops are +defined on a block, with a prescribed iteration range that is always +defined from the perspective of the dataset written/modified (the sizes +of datasets, particularly in multigrid situations, may be very +different). Datasets are passed in using `ops_arg_dat`, and during +execution, values at the current grid point will be passed to the user +kernel. These values are passed wrapped in a templated `ACC<>` object +(templated on the type of the data), whose parentheses operator is +overloaded, which the user must use to specify the relative offset to +access the grid point's neighbours (which accesses have to match the +declared stencil). Datasets written may only be accessed with a +one-point, zero-offset stencil (otherwise the parallel semantics may be +violated). 
+ +Other than datasets, one can pass in read-only scalars or small arrays +that are iteration space invariant with `ops_arg_gbl` (typically +weights, $\delta t$, etc. which may be different in different loops). +The current iteration index can also be passed in with `ops_arg_idx`, +which will pass a globally consistent index to the user kernel (i.e. +also under MPI). + +Reductions in loops are done using the ops_arg_reduce argument, which +takes a reduction handle as an argument. The result of the reduction can +then be acquired using a separate call to `ops_reduction_result`. The +semantics are the following: a reduction handle after it was declared is +in an "uninitialised" state. The first time it is used as an argument to +a loop, its type is determined (increment/min/max), and is initialised +appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in +parallel loops are combined together, up until the point, where the +result is acquired using `ops_reduction_result`, which then sets it back +to an uninitialised state. This also implies, that different parallel +loops, which all use the same reduction handle, but are otherwise +independent, are independent and their partial reduction results can be +combined together associatively and commutatively. + +OPS takes responsibility for all data, its movement and the execution of +parallel loops. With different execution hardware and optimisations, +this means OPS will re-organise data as well as execution (potentially +across different loops), and therefore any data accesses or manipulation +may only be done through the OPS API. + +This restriction is exploited by a lazy execution mechanism in OPS. The +idea is that OPS API calls that do not return a result can be not +executed immediately, rather queued, and once an API call requires +returning some data, operations in the queue are executed, and the +result is returned. This allows OPS to analyse and optimise operations +in the queue together. 
This mechanism is fully automated by OPS, and is +used with the various \_tiled executables. For more information on how +to use this mechanism for improving CPU performance, see Section +[\[sec:tiling\]](#sec:tiling){reference-type="ref" +reference="sec:tiling"}. Some API calls triggering the execution of +queued operations include ops_reduction_result, and the functions in the +data access API. \ No newline at end of file diff --git a/doc/quickstart.md b/doc/quickstart.md new file mode 100644 index 0000000000..93813d43d9 --- /dev/null +++ b/doc/quickstart.md @@ -0,0 +1,3 @@ +# Quick start +## How to use math +$$\alpha$$ \ No newline at end of file diff --git a/doc/requirement.txt b/doc/requirement.txt new file mode 100644 index 0000000000..9af8e80a61 --- /dev/null +++ b/doc/requirement.txt @@ -0,0 +1,2 @@ + # We set the tools needed by sphinx + myst-parser == 0.15.2 diff --git a/doc/user.md b/doc/user.md new file mode 100644 index 0000000000..d3ebca8478 --- /dev/null +++ b/doc/user.md @@ -0,0 +1,839 @@ +--- +author: +- Mike Giles, Istvan Reguly, Gihan Mudalige +date: May 2019 +title: OPS C++ User's Manual +--- + + + + + +# OPS C++ API + +## Initialisation declaration and termination routines + +### {#section .unnumbered} + +::: list +plus 1pt minus 1pt + +the usual command line arguments + +an integer which defines the level of debugging diagnostics and +reporting to be performed +::: + +Currently, higher `diags_level`s does the following checks\ +`diags_level` $=$ 1 : no diagnostics, default to achieve best runtime +performance.\ +`diags_level` $>$ 1 : print block decomposition and `ops_par_loop` +timing breakdown.\ +`diags_level` $>$ 4 : print intra-block halo buffer allocation feedback +(for OPS internal development only)\ +`diags_level` $>$ 5 : check if intra-block halo MPI sends depth match +MPI receives depth (for OPS internal development only)\ + +### {#section-1 .unnumbered} + +::: list +plus 1pt minus 1pt + +dimension of the block + +a name used for 
output diagnostics +::: + +### {#section-2 .unnumbered} + +::: list +plus 1pt minus 1pt + +dimension of the block + +a name used for output diagnostics + +hdf5 file to read and obtain the block information from +::: + +Although this routine does not read in any extra information about the +block from the named HDF5 file than what is already specified in the +arguments, it is included here for error checking (e.g. check if blocks +defined in an HDF5 file is matching with the declared arguments in an +application) and completeness.\ + +### {#section-3 .unnumbered} + +::: list +plus 1pt minus 1pt + +structured block + +dimension of dataset (number of items per grid element) + +size in each dimension of the block + +base indices in each dimension of the block + +padding from the face in the negative direction for each dimension (used +for block halo) + +padding from the face in the positive direction for each dimension (used +for block halo) + +input data of type `T` + +the name of type used for output diagnostics (e.g. "double", "float") + +a name used for output diagnostics +::: + +The `size` allows to declare different sized data arrays on a given +`block`. `d_m` and `d_p` are depth of the "block halos" that are used to +indicate the offset from the edge of a block (in both the negative and +positive directions of each dimension).\ +\ + +### {#section-4 .unnumbered} + +::: list +plus 1pt minus 1pt + +structured block + +dimension of dataset (number of items per grid element) + +the name of type used for output diagnostics (e.g. "double", "float") + +name of the dat used for output diagnostics + +hdf5 file to read and obtain the data from +::: + +### {#section-5 .unnumbered} + +::: list +plus 1pt minus 1pt + +a name used to identify the constant + +dimension of dataset (number of items per element) + +the name of type used for output diagnostics (e.g. 
"double", "float") + +pointer to input data of type `T` +::: + +### {#section-6 .unnumbered} + +::: list +plus 1pt minus 1pt + +a name used to identify the constant + +dimension of dataset (number of items per element) + +the name of type used for output diagnostics (e.g. "double", "float") + +pointer to new values for constant of type `T` +::: + +### {#section-7 .unnumbered} + +::: list +plus 1pt minus 1pt + +origin dataset + +destination dataset + +defines an iteration size (number of indices to iterate over in each +direction) + +indices of starting point in \"from\" dataset + +indices of starting point in \"to\" dataset + +direction of incrementing for \"from\" for each dimension of `iter_size` + +direction of incrementing for \"to\" for each dimension of `iter_size` +::: + +A from_dir \[1,2\] and a to_dir \[2,1\] means that x in the first block +goes to y in the second block, and y in first block goes to x in second +block. A negative sign indicates that the axis is flipped. (Simple +example: a transfer from (1:2,0:99,0:99) to (-1:0,0:99,0:99) would use +iter_size = \[2,100,100\], from_base = \[1,0,0\], to_base = \[-1,0,0\], +from_dir = \[0,1,2\], to_dir = \[0,1,2\]. In more complex case this +allows for transfers between blocks with different orientations.)\ + +### {#section-8 .unnumbered} + +::: list +plus 1pt minus 1pt + +origin dataset + +destination dataset + +hdf5 file to read and obtain the data from +::: + +### {#section-9 .unnumbered} + +::: list +plus 1pt minus 1pt + +number of halos in `halos` + +array of halos +::: + +### {#section-10 .unnumbered} + +::: list +plus 1pt minus 1pt + +size of data in bytes + +the name of type used for output diagnostics (e.g. 
"double", "float") + +name of the dat used for output diagnostics +::: + +::: list +plus 1pt minus 1pt + +the `ops_reduction` handle + +a pointer to write the results to, memory size has to match the declared +::: + +### {#section-11 .unnumbered} + +::: list +plus 1pt minus 1pt + +string describing the partitioning method. Currently this string is not +used internally, but is simply a place-holder to indicate different +partitioning methods in the future. +::: + +### {#section-12 .unnumbered} + +::: list +plus 1pt minus 1pt +::: + +## Diagnostics and output routines + +### {#section-13 .unnumbered} + +::: list +plus 1pt minus 1pt +::: + +### {#section-14 .unnumbered} + +::: list +plus 1pt minus 1pt +::: + +### {#section-15 .unnumbered} + +::: list +plus 1pt minus 1pt + +variable to hold the CPU time at the time of invocation + +variable to hold the elapsed time at the time of invocation +::: + +### {#section-16 .unnumbered} + +::: list +plus 1pt minus 1pt + +ops_block to be written + +hdf5 file to write to +::: + +### {#section-17 .unnumbered} + +::: list +plus 1pt minus 1pt + +ops_stencil to be written + +hdf5 file to write to +::: + +### {#section-18 .unnumbered} + +::: list +plus 1pt minus 1pt + +ops_dat to be written + +hdf5 file to write to +::: + +### {#section-19 .unnumbered} + +::: list +plus 1pt minus 1pt + +ops_dat to be written + +text file to write to +::: + +### {#section-20 .unnumbered} + +::: list +plus 1pt minus 1pt + +output stream, use stdout to print to standard out +::: + +### {#section-21 .unnumbered} + +::: list +plus 1pt minus 1pt + +ops_dat to be checked +::: + +## Halo exchange + +### {#section-22 .unnumbered} + +::: list +plus 1pt minus 1pt + +the halo group +::: + +## Parallel loop syntax + +A parallel loop with N arguments has the following syntax: + +### {#section-23 .unnumbered} + +::: list +plus 1pt minus 1pt + +user's kernel function with N arguments + +name of kernel function, used for output diagnostics + +the ops_block over 
which this loop executes + +dimension of loop iteration + +iteration range array + +arguments +::: + +The **ops_arg** arguments in **ops_par_loop** are provided by one of the +following routines, one for global constants and reductions, and the +other for OPS datasets. + +### {#section-24 .unnumbered} + +::: list +plus 1pt minus 1pt + +data array + +array dimension + +string representing the type of data held in data + +access type +::: + +### {#section-25 .unnumbered} + +::: list +plus 1pt minus 1pt + +an `ops_reduction` handle + +array dimension (according to `type`) + +string representing the type of data held in data + +access type +::: + +### {#section-26 .unnumbered} + +::: list +plus 1pt minus 1pt + +dataset + +stencil for accessing data + +string representing the type of data held in dataset + +access type +::: + +### {#section-27 .unnumbered} + +::: list +plus 1pt minus 1pt +::: + +## Stencils + +The final ingredient is the stencil specification, for which we have two +versions: simple and strided.\ + +### {#section-28 .unnumbered} + +::: list +plus 1pt minus 1pt + +dimension of loop iteration + +number of points in the stencil + +stencil for accessing data + +string representing the name of the stencil +::: + +### {#section-29 .unnumbered} + +::: list +plus 1pt minus 1pt + +dimension of loop iteration + +number of points in the stencil + +stencil for accessing data + +stride for accessing data + +string representing the name of the stencil\ +::: + +### {#section-30 .unnumbered} + +::: list +plus 1pt minus 1pt + +dimension of loop iteration + +number of points in the stencil + +string representing the name of the stencil + +hdf5 file to write to +::: + +In the strided case, the semantics for the index of data to be accessed, +for stencil point `p`, in dimension `m` are defined as:\ +,\ +where `loop_index[m]` is the iteration index (within the user-defined +iteration space) in the different dimensions. 
+ +If, for one or more dimensions, both `stride[m]` and `stencil[p*dims+m]` +are zero, then one of the following must be true; + +- the dataset being referenced has size 1 for these dimensions + +- these dimensions are to be omitted and so the dataset has dimension + equal to the number of remaining dimensions. + +See `OPS/apps/c/CloverLeaf/build_field.cpp` and +`OPS/apps/c/CloverLeaf/generate.cpp` for an example +`ops_decl_strided_stencil` declaration and its use in a loop, +respectively.\ +These two stencil definitions probably take care of all of the cases in +the Introduction except for multiblock applications with interfaces with +different orientations -- this will need a third, even more general, +stencil specification. The strided stencil will handle both multigrid +(with a stride of 2 for example) and the boundary condition and reduced +dimension applications (with a stride of 0 for the relevant dimensions). + +## Checkpointing + +OPS supports the automatic checkpointing of applications. Using the API +below, the user specifies the file name for the checkpoint and an +average time interval between checkpoints, OPS will then automatically +save all necessary information periodically that is required to +fast-forward to the last checkpoint if a crash occurred. Currently, when +re-launching after a crash, the same number of MPI processes have to be +used. To enable checkpointing mode, the `OPS_CHECKPOINT` runtime +argument has to be used.\ + +### {#section-31 .unnumbered} + +::: list +plus 1pt minus 1pt + +name of the file for checkpointing. In MPI, this will automatically be +post-fixed with the rank ID. 
+ +average time (seconds) between checkpoints + +a combinations of flags, listed in `ops_checkpointing.h`:\ +OPS_CHECKPOINT_INITPHASE - indicates that there are a number of parallel +loops at the very beginning of the simulations which should be excluded +from any checkpoint; mainly because they initialise datasets that do not +change during the main body of the execution. During restore mode these +loops are executed as usual. An example would be the computation of the +mesh geometry, which can be excluded from the checkpoint if it is +re-computed when recovering and restoring a checkpoint. The API call +void `ops_checkpointing_initphase_done()` indicates the end of this +initial phase. + +OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually +controls the location of the checkpoint, and explicitly specifies the +list of `ops_dat`s to be saved. + +OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the +location of the checkpoint, and it also enables fast-forwarding, by +skipping the execution of the application (even though none of the +parallel loops would actually execute, there may be significant work +outside of those) up to the checkpoint. + +OPS_CHECKPOINT_MANUAL - Indicates that when the corresponding API +function is called, the checkpoint should be created. Assumes the +presence of the above two options as well. 
+::: + +### {#section-32 .unnumbered} + +::: list +plus 1pt minus 1pt + +number of datasets to be saved + +arrays of `ops_dat` handles to be saved +::: + +### {#section-33 .unnumbered} + +::: list +plus 1pt minus 1pt + +size of the payload in bytes + +pointer to memory into which the payload is packed +::: + +### {#section-34 .unnumbered} + +::: list +plus 1pt minus 1pt + +number of datasets to be saved + +arrays of `ops_dat` handles to be saved + +size of the payload in bytes + +pointer to memory into which the payload is packed +::: + +### {#section-35 .unnumbered} + +::: list +plus 1pt minus 1pt + +number of datasets to be saved + +arrays of `ops_dat` handles to be saved + +size of the payload in bytes + +pointer to memory into which the payload is packed +::: + +The suggested use of these **manual** functions is of course when the +optimal location for checkpointing is known - one of the ways to +determine that is to use the built-in algorithm. More details of this +will be reported in a tech-report on checkpointing, to be published +later. + +## Access to OPS data + +This section describes APIS that give the user access to internal data +structures in OPS and return data to user-space. 
These should be used +cautiously and sparsely, as they can affect performance significantly + +### {#section-36 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset +::: + +### {#section-37 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset +::: + +### {#section-38 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +an array populated with the displacement of the chunk within the +"global" distributed array + +an array populated with the spatial extents +::: + +### {#section-39 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +an array populated with the displacement of the chunk within the +"global" distributed array + +an array populated with the spatial extents + +an array populated strides in spatial dimensions needed for column-major +indexing + +an array populated with padding on the left in each dimension. Note that +these are negative values + +an array populated with padding on the right in each dimension +::: + +### {#section-40 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +a stencil used to determine required MPI halo exchange depths + +when set to OPS_HOST or OPS_DEVICE, returns a pointer to data in that +memory space, otherwise must be set to 0, and returns whether data is in +the host or on the device +::: + +### {#section-41 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +the kind of access that was used by the user (OPS_READ if it was read +only, OPS_WRITE if it was overwritten, OPS_RW if it was read and +written) +::: + +### {#section-42 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +the kind of access that was used by the user (OPS_READ if it was read +only, OPS_WRITE if it was overwritten, OPS_RW if it was read and +written) + +set to OPS_HOST or OPS_DEVICE +::: + +### {#section-43 .unnumbered} + 
+::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +pointer to memory which should be filled by OPS +::: + +### {#section-44 .unnumbered} + +::: list +plus 1pt minus 1pt + +the dataset + +the chunk index (has to be 0) + +pointer to memory which should be copied to OPS +::: + +# Tiling for Cache-blocking + +OPS has a code generation (ops_gen_mpi_lazy) and build target for +tiling. Once compiled, to enable, use the `OPS_TILING` runtime parameter +- this will look at the L3 cache size of your CPU and guess the correct +tile size. If you want to alter the amount of cache to be used for the +guess, use the `OPS_CACHE_SIZE=XX` runtime parameter, where the value is +in Megabytes. To manually specify the tile sizes, use the +OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments. + +When MPI is combined with OpenMP tiling can be extended to the MPI +halos. Set `OPS_TILING_MAXDEPTH` to increase the halo depths so that +halos for multiple `ops_par_loops` can be exchanged with a single MPI +message (see [@TPDS2017] for more details)\ +To test, compile CloverLeaf under `apps/c/CloverLeaf`, modify clover.in +to use a $6144^2$ mesh, then run as follows:\ +For OpenMP with tiling:\ +`export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING`\ +For MPI+OpenMP with tiling:\ +`export OMP_NUM_THREADS=xx; mpirun -np xx ./cloverleaf_mpi_tiled OPS_TILING OPS_TILING_MAXDEPTH=6`\ +To manually specify the tile sizes (in number of grid points), use the +OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments:\ +`export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING OPS_TILESIZE_X=600 OPS_TILESIZE_Y=200 ` + +# CUDA and OpenCL Runtime Arguments + +The CUDA (and OpenCL) thread block sizes can be controlled by setting +the `OPS_BLOCK_SIZE_X, OPS_BLOCK_SIZE_Y` and `OPS_BLOCK_SIZE_Z` runtime +arguments. 
For example :\ +`./cloverleaf_cuda OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4`\ +`OPS_CL_DEVICE=XX` runtime flag sets the OpenCL device to execute the +code on.\ +Usually `OPS_CL_DEVICE=0` selects the CPU and `OPS_CL_DEVICE=1` selects +GPUs. + +# Executing with GPUDirect + +GPU direct support for MPI+CUDA, to enable (on the OPS side) add +**-gpudirect** when running the executable. You may also have to use +certain environmental flags when using different MPI distributions. For +an example of the required flags and environmental settings on the +Cambridge Wilkes2 GPU cluster see:\ + + +# OPS User Kernels + +In OPS, the elemental operation carried out per mesh/grid point is +specified as an outlined function called a *user kernel*. An example +taken from the Cloverleaf application is given in Figure +[\[fig:example\]](#fig:example){reference-type="ref" +reference="fig:example"}.\ + +``` {.cpp mathescape="" linenos="" startFrom="1" numbersep="0pt" gobble="2" frame="lines" framesep="1mm"} +void accelerate_kernel( const ACC &density0, const ACC &volume, + ACC &stepbymass, const ACC &xvel0, ACC &xvel1, + const ACC &xarea, const ACC &pressure, + const ACC &yvel0, ACC &yvel1, + const ACC &yarea, const ACC &viscosity) { + + double nodal_mass; + + //{0,0, -1,0, 0,-1, -1,-1}; + nodal_mass = ( density0(-1,-1) * volume(-1,-1) + + density0(0,-1) * volume(0,-1) + + density0(0,0) * volume(0,0) + + density0(-1,0) * volume(-1,0) ) * 0.25; + + stepbymass(0,0) = 0.5*dt/ nodal_mass; + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, 0,-1}; + + xvel1(0,0) = xvel0(0,0) - stepbymass(0,0) * + ( xarea(0,0) * ( pressure(0,0) - pressure(-1,0) ) + + xarea(0,-1) * ( pressure(0,-1) - pressure(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, -1,0}; + + yvel1(0,0) = yvel0(0,0) - stepbymass(0,0) * + ( yarea(0,0) * ( pressure(0,0) - pressure(0,-1) ) + + yarea(-1,0) * ( pressure(-1,0) - pressure(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, 0,-1}; + + xvel1(0,0) = xvel1(0,0) - stepbymass(0,0) * + ( 
xarea(0,0) * ( viscosity(0,0) - viscosity(-1,0) ) + + xarea(0,-1) * ( viscosity(0,-1) - viscosity(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, -1,0}; + + yvel1(0,0) = yvel1(0,0) - stepbymass(0,0) * + ( yarea(0,0) * ( viscosity(0,0) - viscosity(0,-1) ) + + yarea(-1,0) * ( viscosity(-1,0) - viscosity(-1,-1) ) ); + + +} +``` + +[\[fig:example\]]{#fig:example label="fig:example"} + +\ +\ +\ +\ +This user kernel is then used in an `ops_par_loop` (Figure +[\[fig:parloop\]](#fig:parloop){reference-type="ref" +reference="fig:parloop"}). The key aspect to note in the user kernel in +Figure [\[fig:example\]](#fig:example){reference-type="ref" +reference="fig:example"} is the use of the ACC\<\> objects and their +parentheses operator. These specify the stencil in accessing the +elements of the respective data arrays. + +``` {.cpp mathescape="" linenos="" startFrom="1" numbersep="0pt" gobble="2" frame="lines" framesep="2mm"} + int rangexy_inner_plus1[] = {x_min,x_max+1,y_min,y_max+1}; + + ops_par_loop(accelerate_kernel, "accelerate_kernel", clover_grid, 2, rangexy_inner_plus1, + ops_arg_dat(density0, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(volume, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), + ops_arg_dat(xvel0, 1, S2D_00, "double", OPS_READ), + ops_arg_dat(xvel1, 1, S2D_00, "double", OPS_INC), + ops_arg_dat(xarea, 1, S2D_00_0M1, "double", OPS_READ), + ops_arg_dat(pressure, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(yvel0, 1, S2D_00, "double", OPS_READ), + ops_arg_dat(yvel1, 1, S2D_00, "double", OPS_INC), + ops_arg_dat(yarea, 1, S2D_00_M10, "double", OPS_READ), + ops_arg_dat(viscosity, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ)); +``` + +[\[fig:parloop\]]{#fig:parloop label="fig:parloop"} + +::: thebibliography +1 OP2 for Many-Core Platforms, 2013. + + +Istvan Z. Reguly, G.R. Mudalige, Mike B. Giles. Loop Tiling in +Large-Scale Stencil Codes at Run-time with OPS. 
(2017) IEEE Transactions +on Parallel and Distributed Systems. + +::: From 66bab57de9f4312990f3920f2c938b2753702ec6 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:08:29 +0100 Subject: [PATCH 164/324] Update index.rst Update highlevel structure --- doc/index.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/index.rst b/doc/index.rst index d4f72096a9..beef678a88 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -11,8 +11,12 @@ Welcome to Test's documentation! :caption: Contents: introduction.md - keyconcept.md installation.md + devanapp.md + keyconcept.md + opsapi.md + devdoc.md + pubs.md From 996ac6ba0826c8058c1eccab40e3e5e89c52355d Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:10:46 +0100 Subject: [PATCH 165/324] Create devanapp.md new file --- doc/devanapp.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/devanapp.md diff --git a/doc/devanapp.md b/doc/devanapp.md new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/doc/devanapp.md @@ -0,0 +1 @@ + From 99951e3c702532a78aa12a2eeeb7e9eccb4750e5 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:11:11 +0100 Subject: [PATCH 166/324] Create opsapi.md new file --- doc/opsapi.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/opsapi.md diff --git a/doc/opsapi.md b/doc/opsapi.md new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/doc/opsapi.md @@ -0,0 +1 @@ + From 5b5d41911ea9fbcb553df451b10162bc5a74f4b0 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:11:44 +0100 Subject: [PATCH 167/324] Create devdoc.md new file --- doc/devdoc.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/devdoc.md diff --git a/doc/devdoc.md b/doc/devdoc.md new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/doc/devdoc.md @@ -0,0 +1 @@ + From eb4dff9b6e4f3e2c91e51b03329c225150a67fdd Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:12:16 +0100 Subject: [PATCH 168/324] Create pubs.md new file --- doc/pubs.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/pubs.md diff --git a/doc/pubs.md b/doc/pubs.md new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/doc/pubs.md @@ -0,0 +1 @@ + From ab2416b90a7e70c75346080fb75feaf6de7d7aff Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:13:40 +0100 Subject: [PATCH 169/324] Update conf.py update copyright --- doc/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 8be05822d8..4d3b1059f1 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -18,7 +18,7 @@ # -- Project information ----------------------------------------------------- project = 'Oxford Parallel library for Structured mesh solvers' -copyright = 'Copyright (c) 2013, Mike Giles and others' +copyright = 'Copyright (c) 2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others' author = "Mike Giles, Istvan Reguly, Gihan Mudalige" # The full version, including alpha/beta/rc tags @@ -57,4 +57,4 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file +html_static_path = ['_static'] From 26c4d6edb63b0b2437cfcb53c82cdb93a2c2fa25 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:15:57 +0100 Subject: [PATCH 170/324] Update conf.py --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 4d3b1059f1..c53bf5a553 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -18,7 +18,7 @@ # -- Project information ----------------------------------------------------- project = 'Oxford Parallel library for Structured mesh solvers' -copyright = 'Copyright (c) 2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others' +copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others' author = "Mike Giles, Istvan Reguly, Gihan Mudalige" # The full version, including alpha/beta/rc tags From 2fdf32b86c7e66ce04e2ea46fc398691daeb2890 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:18:56 +0100 Subject: [PATCH 171/324] Update opsapi.md moving key concepts --- doc/opsapi.md | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/doc/opsapi.md b/doc/opsapi.md index 8b13789179..be6f08cd98 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -1 +1,42 @@ +Many of the API and library follows the structure of the OP2 high-level +library for unstructured mesh applications [@op2]. However the +structured mesh domain is distinct from the unstructured mesh +applications domain due to the implicit connectivity between +neighbouring mesh elements (such as vertices, cells) in structured +meshes/grids. The key idea is that operations involve looping over a +"rectangular" multi-dimensional set of grid points using one or more +"stencils" to access data. In multi-block grids, we have several +structured blocks. The connectivity between the faces of different +blocks can be quite complex, and in particular they may not be oriented +in the same way, i.e. an $i,j$ face of one block may correspond to the +$j,k$ face of another block. This is awkward and hard to handle simply. 
+ +To clarify some of the important issues in designing the API, we note +here some needs connected with a 3D application: + +- When looping over the interior with loop indices $i,j,k$, often + there are 1D arrays which are referenced using just one of the + indices. + +- To implement boundary conditions, we often loop over a 2D face, + accessing both the 3D dataset and data from a 2D dataset. + +- To implement periodic boundary conditions using dummy "halo" points, + we sometimes have to copy one plane of boundary data to another. + e.g. if the first dimension has size $I$ then we might copy the + plane $i=I\!-\!2$ to plane $i=0$, and plane $i=1$ to plane + $i=I\!-\!1$. + +- In multigrid, we are working with two grids with one having twice as + many points as the other in each direction. To handle this we + require a stencil with a non-unit stride. + +- In multi-block grids, we have several structured blocks. The + connectivity between the faces of different blocks can be quite + complex, and in particular they may not be oriented in the same way, + i.e. an $i,j$ face of one block may correspond to the $j,k$ face of + another block. This is awkward and hard to handle simply. + +The latest proposal is to handle all of these different requirements +through stencil definitions. From 8b62d295d3825de4d6a12f1bf590925c793b4330 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:19:42 +0100 Subject: [PATCH 172/324] Update introduction.md Introduction section structure update --- doc/introduction.md | 45 ++++----------------------------------------- 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/doc/introduction.md b/doc/introduction.md index 515fcee001..d7d6a499c3 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -7,44 +7,7 @@ of an unstructured collection of structured meshes/grids. This document describes the OPS C++ API, which supports the development of single-block and multi-block structured meshes. 
-Many of the API and library follows the structure of the OP2 high-level -library for unstructured mesh applications [@op2]. However the -structured mesh domain is distinct from the unstructured mesh -applications domain due to the implicit connectivity between -neighbouring mesh elements (such as vertices, cells) in structured -meshes/grids. The key idea is that operations involve looping over a -"rectangular" multi-dimensional set of grid points using one or more -"stencils" to access data. In multi-block grids, we have several -structured blocks. The connectivity between the faces of different -blocks can be quite complex, and in particular they may not be oriented -in the same way, i.e. an $i,j$ face of one block may correspond to the -$j,k$ face of another block. This is awkward and hard to handle simply. - -To clarify some of the important issues in designing the API, we note -here some needs connected with a 3D application: - -- When looping over the interior with loop indices $i,j,k$, often - there are 1D arrays which are referenced using just one of the - indices. - -- To implement boundary conditions, we often loop over a 2D face, - accessing both the 3D dataset and data from a 2D dataset. - -- To implement periodic boundary conditions using dummy "halo" points, - we sometimes have to copy one plane of boundary data to another. - e.g. if the first dimension has size $I$ then we might copy the - plane $i=I\!-\!2$ to plane $i=0$, and plane $i=1$ to plane - $i=I\!-\!1$. - -- In multigrid, we are working with two grids with one having twice as - many points as the other in each direction. To handle this we - require a stencil with a non-unit stride. - -- In multi-block grids, we have several structured blocks. The - connectivity between the faces of different blocks can be quite - complex, and in particular they may not be oriented in the same way, - i.e. an $i,j$ face of one block may correspond to the $j,k$ face of - another block. 
This is awkward and hard to handle simply. - -The latest proposal is to handle all of these different requirements -through stencil definitions. \ No newline at end of file +## Overview +## Licencing +## Citing +## Support From 7e1c7cf5788ec5f679801b311a6a15a958ecae48 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:29:12 +0100 Subject: [PATCH 173/324] Update installation.md structure for the installation.md file --- doc/installation.md | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index c46de11093..faa8765b0a 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -36,9 +36,11 @@ * `MPI_HOME` - Installation directory of MPI (to build MPI based distributed memory libs and applications) only needed if MPI not installed in standard locations * `HDF5_ROOT` - Installation directory of HDF5 (to support HDF5 based File I/O) if HDF5 not installed in standard location --> +## Obtaining OPS ## Build OPS back-end libraries example applications -### Build the library and example applications together +### Using `cmake` +#### Build the library and example applications together Create a build directory, and run CMake (version 3.18 or newer) ```bash @@ -51,7 +53,7 @@ ``` After installation, the library and the python translator can be found at the direcory specified by CMAKE_INSTALL_PREFIX, together with the executable files for applications at APP_INSTALL_DIR. 
-### Build the library and example applications separately +#### Build the library and example applications separately In this mode, the library can be firstly built and installed as @@ -72,14 +74,14 @@ then the application can be built as cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 make # IEEE=1 this option is important for applications to get accurate results ``` -### Tests +#### Tests A few tasks for testing codes can be run by ```bash make test ``` The current tests are mainly based on the applications. -### Options of interest to specify to `cmake` include: +#### `cmake` options * `-DCMAKE_BUILD_TYPE=Release` - enable optimizations * `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only) @@ -92,3 +94,15 @@ The current tests are mainly based on the applications. +### Using regular `Makefiles` +#### Build library +#### Build application +#### Makefile options + +## Running example applications +### CloverLeaf +### CloverLeaf_3D_HDF5 +### poisson +### adi + +## Runtime flags and options From 9af6b393b28cfce5db39c3cfdc4445b50637a010 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:32:40 +0100 Subject: [PATCH 174/324] Update installation.md --- doc/installation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index faa8765b0a..9ba6bbf668 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -1,4 +1,4 @@ -# Installation +# Getting Started **Note: The current CMakefile and relevant instructions are mainly tested on linux-based systems including Windows Subsystem for Linux** @@ -39,7 +39,7 @@ ## Obtaining OPS ## Build OPS back-end libraries example applications -### Using `cmake` +### Using cmake #### Build the library and example applications together Create a build directory, and run CMake (version 3.18 or newer) @@ -94,7 +94,7 @@ The current tests are mainly based on the applications. -### Using regular `Makefiles` +### Using regular Makefiles #### Build library #### Build application #### Makefile options From c768ff6e37862265dbcc207d4989961e634a32e7 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:33:58 +0100 Subject: [PATCH 175/324] Update installation.md --- doc/installation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 9ba6bbf668..a3230a1153 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -38,7 +38,7 @@ ## Obtaining OPS -## Build OPS back-end libraries example applications +## Build OPS back-end libraries and example applications ### Using cmake #### Build the library and example applications together @@ -95,8 +95,8 @@ The current tests are mainly based on the applications. ### Using regular Makefiles -#### Build library -#### Build application +#### Build back-end library +#### Build an application #### Makefile options ## Running example applications From 2dba1a0c3393ceb87fff5e95073c6780094ec03a Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:39:50 +0100 Subject: [PATCH 176/324] Update index.rst --- doc/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/index.rst b/doc/index.rst index beef678a88..0dcc8c4007 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -13,7 +13,6 @@ Welcome to Test's documentation! introduction.md installation.md devanapp.md - keyconcept.md opsapi.md devdoc.md pubs.md From 50ebfecc9bb9c08444d2ddcf569404a5ca7d5fb5 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:41:20 +0100 Subject: [PATCH 177/324] Update opsapi.md --- doc/opsapi.md | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/doc/opsapi.md b/doc/opsapi.md index be6f08cd98..4a514efa42 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -1,3 +1,106 @@ +# Key concepts and structure + +An OPS application can generally be divided into two key parts: +initialisation and parallel execution. During the initialisation phase, +one or more blocks (ops_block) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a +block, and have a specific size (in each dimension of the block), which +may be slightly different across different datasets (e.g. staggered +grids), in some directions they may be degenerate (a size of 1), or they +can represent data associated with different multigrid levels (where +their size if a multiple or a fraction of other datasets). Datasets can +be declared with empty (NULL) pointers, then OPS will allocate the +appropriate amount of memory, may be passed non-NULL pointers (currently +only supported in non-MPI environments), in which case OPS will assume +the memory is large enough for the data and the block halo, and there +are HDF5 dataset declaration routines which allow the distributed +reading of datasets from HDF5 files. 
The concept of blocks is necessary +to group datasets together, as in a multi-block problem, in a +distributed memory environment, OPS needs to be able to determine how to +decompose the problem. + +The initialisation phase usually also consists of defining the stencils +to be used later on (though they can be defined later as well), which +describe the data access patterns used in parallel loops. Stencils are +always relative to the "current" point; e.g. if at iteration $(i,j)$, we +wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two +points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in +one of the dimensions the dataset's size is 1), as well as for +multigrid, there are special strided, restriction, and prolongation +stencils: they differ from normal stencils in that as one steps through +a grid in a parallel loop, the stepping is done with a non-unit stride +for these datasets. For example, in a 2D problem, if we have a +degenerate dataset called xcoords, size $(N,1)$, then we will need a +stencil with stride $(1,0)$ to access it in a regular 2D loop. + +Finally, the initialisation phase may declare a number of global +constants - these are variables in global scope that can be accessed +from within user kernels, without having to pass them in explicitly. +These may be scalars or small arrays, generally for values that do not +change during execution, though they may be updated during execution +with repeated calls to `ops_decl_const`. + +The initialisation phase is terminated by a call to `ops_partition`. + +The bulk of the application consists of parallel loops, implemented +using calls to `ops_par_loop`. These constructs work with datasets, +passed through the opaque `ops_dat` handles declared during the +initialisation phase. 
The iterations of parallel loops are semantically +independent, and it is the responsibility of the user to enforce this: +the order in which iterations are executed cannot affect the result +(within the limits of floating point precision). Parallel loops are +defined on a block, with a prescribed iteration range that is always +defined from the perspective of the dataset written/modified (the sizes +of datasets, particularly in multigrid situations, may be very +different). Datasets are passed in using `ops_arg_dat`, and during +execution, values at the current grid point will be passed to the user +kernel. These values are passed wrapped in a templated `ACC<>` object +(templated on the type of the data), whose parentheses operator is +overloaded, which the user must use to specify the relative offset to +access the grid point's neighbours (which accesses have to match the the +declared stencil). Datasets written may only be accessed with a +one-point, zero-offset stencil (otherwise the parallel semantics may be +violated). + +Other than datasets, one can pass in read-only scalars or small arrays +that are iteration space invariant with `ops_arg_gbl` (typically +weights, $\delta t$, etc. which may be different in different loops). +The current iteration index can also be passed in with `ops_arg_idx`, +which will pass a globally consistent index to the user kernel (i.e. +also under MPI). + +Reductions in loops are done using the ops_arg_reduce argument, which +takes a reduction handle as an argument. The result of the reduction can +then be acquired using a separate call to `ops_reduction_result`. The +semantics are the following: a reduction handle after it was declared is +in an "uninitialised" state. 
The first time it is used as an argument to +a loop, its type is determined (increment/min/max), and is initialised +appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in +parallel loops are combined together, up until the point, where the +result is acquired using `ops_reduction_result`, which then sets it back +to an uninitialised state. This also implies, that different parallel +loops, which all use the same reduction handle, but are otherwise +independent, are independent and their partial reduction results can be +combined together associatively and commutatively. + +OPS takes responsibility for all data, its movement and the execution of +parallel loops. With different execution hardware and optimisations, +this means OPS will re-organise data as well as execution (potentially +across different loops), and therefore any data accesses or manipulation +may only be done through the OPS API. + +This restriction is exploited by a lazy execution mechanism in OPS. The +idea is that OPS API calls that do not return a result can be not +executed immediately, rather queued, and once an API call requires +returning some data, operations in the queue are executed, and the +result is returned. This allows OPS to analyse and optimise operations +in the queue together. This mechanism is fully automated by OPS, and is +used with the various \_tiled executables. For more information on how +to use this mechanism for improving CPU performance, see Section +[\[sec:tiling\]](#sec:tiling){reference-type="ref" +reference="sec:tiling"}. Some API calls triggering the execution of +queued operations include ops_reduction_result, and the functions in the +data access API. + Many of the API and library follows the structure of the OP2 high-level library for unstructured mesh applications [@op2]. 
However the @@ -40,3 +143,6 @@ here some needs connected with a 3D application: The latest proposal is to handle all of these different requirements through stencil definitions. + +# OPS API - C++ +# OPS API - Fortran From 5d00c766b01806383ed4ece18d99588de5b3bb6b Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:42:04 +0100 Subject: [PATCH 178/324] Update opsapi.md --- doc/opsapi.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 4a514efa42..942d5dee10 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -144,5 +144,5 @@ here some needs connected with a 3D application: The latest proposal is to handle all of these different requirements through stencil definitions. -# OPS API - C++ -# OPS API - Fortran +# OPS API + From be033346aab139963874dc0ef982c76551962b24 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:43:16 +0100 Subject: [PATCH 179/324] Update opsapi.md --- doc/opsapi.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 942d5dee10..408bf9e341 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -1,4 +1,6 @@ -# Key concepts and structure +# OPS API + +## Key concepts and structure An OPS application can generally be divided into two key parts: initialisation and parallel execution. During the initialisation phase, @@ -144,5 +146,5 @@ here some needs connected with a 3D application: The latest proposal is to handle all of these different requirements through stencil definitions. -# OPS API +## OPS C++ API From cbc2a919a114a8f4e00d2bb61e1dc41f32611973 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:46:25 +0100 Subject: [PATCH 180/324] Update devanapp.md --- doc/devanapp.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 8b13789179..62ce0a66c0 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1 +1,5 @@ - +# Developing and OPS Application +## Tutorial +## Supported paralleizations +## Code-generation flags +## File I/O From 7ce14ffc416e7000c39f117f74bb2207e79f7c7d Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:50:00 +0100 Subject: [PATCH 181/324] Update devdoc.md --- doc/devdoc.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index 8b13789179..de190feabd 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -1 +1,11 @@ - +# Developer Guide +## Code-generator +### Frontend API parser +### Target Parallel Templates +### Elemental Kernel Transformations +## Back-end library +### Sequential and multi-threaded CPU +### MPI and Partitioning +### HDF5 +### CUDA +### Cache blocking tiling and comm-avoiding optimizations From e3cb17cef0475030a064d559e87d1367870191cb Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 22:50:24 +0100 Subject: [PATCH 182/324] Update pubs.md --- doc/pubs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/pubs.md b/doc/pubs.md index 8b13789179..9aca12e2f5 100644 --- a/doc/pubs.md +++ b/doc/pubs.md @@ -1 +1,2 @@ +# Publications From 5f4f14e35bc32195a559597e804d0f64bde2771e Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 22:55:52 +0100 Subject: [PATCH 183/324] Update installation.md --- doc/installation.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index a3230a1153..626c246412 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -38,8 +38,8 @@ ## Obtaining OPS -## Build OPS back-end libraries and example applications -### Using cmake +## Build OPS Back-end Libraries and Example Applications +### Using Cmake #### Build the library and example applications together Create a build directory, and run CMake (version 3.18 or newer) @@ -94,15 +94,15 @@ The current tests are mainly based on the applications. -### Using regular Makefiles +### Using Makefiles #### Build back-end library #### Build an application #### Makefile options -## Running example applications +## Running Example Applications ### CloverLeaf ### CloverLeaf_3D_HDF5 ### poisson ### adi -## Runtime flags and options +## Runtime Flags and Options From cd57cc6f0920a1cb3ae82ed90a2076e48aec6c9e Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 23:03:24 +0100 Subject: [PATCH 184/324] Update installation.md --- doc/installation.md | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 626c246412..3f77adc265 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -4,9 +4,9 @@ ## Dependencies - * CMake + **CMake** - CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. +CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. 
```bash version=3.19.0 wget https://github.com/Kitware/CMake/releases/download/v$version/cmake-$version-Linux-x86_64.sh @@ -18,28 +18,22 @@ sudo ln -s $cmake_dir/bin/cmake /usr/local/bin/cmake ``` - * Python2 + **Python2** - **Python2** is required by the OPS Python translator. The CMake build system will try to identify it automatically. However, the process can fail sometime (e.g., if there are both Python2 and Python3 installed). If this happens, the path to Python2 can be specified manually by using **-DPython2_EXECUTABLE** when invoking CMake +Python2 is required by the OPS Python translator. The CMake build system will try to identify it automatically. However, the process can fail sometime (e.g., if there are both Python2 and Python3 installed). If this happens, the path to Python2 can be specified manually by using `-DPython2_EXECUTABLE` when invoking CMake - * HDF5 + **HDF5** - [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using -DHDF5_ROOT. +[HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using -DHDF5_ROOT. - * CUDA + **CUDA** - The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path. - - +The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. 
please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path. ## Obtaining OPS ## Build OPS Back-end Libraries and Example Applications -### Using Cmake +### Using cmake #### Build the library and example applications together Create a build directory, and run CMake (version 3.18 or newer) @@ -74,14 +68,16 @@ then the application can be built as cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 make # IEEE=1 this option is important for applications to get accurate results ``` -#### Tests + + The current tests are mainly based on the applications. -#### `cmake` options +#### cmake options * `-DCMAKE_BUILD_TYPE=Release` - enable optimizations * `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only) @@ -94,6 +90,11 @@ The current tests are mainly based on the applications. + + ### Using Makefiles #### Build back-end library #### Build an application From 883b09201e7ad06e96fd6d402c4a6375da7f20ed Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 23:17:29 +0100 Subject: [PATCH 185/324] Update installation.md --- doc/installation.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index 3f77adc265..8313a36213 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -31,7 +31,10 @@ Python2 is required by the OPS Python translator. The CMake build system will tr The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path. ## Obtaining OPS - +```bash +git clone https://github.com/gihanmudalige/OPS.git +``` + ## Build OPS Back-end Libraries and Example Applications ### Using cmake #### Build the library and example applications together From 14ead12494d7d8d82d5f615651cd8e5af11f5204 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 23:18:05 +0100 Subject: [PATCH 186/324] Update installation.md --- doc/installation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 8313a36213..ba3ea88b30 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -24,11 +24,11 @@ Python2 is required by the OPS Python translator. The CMake build system will tr **HDF5** -[HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using -DHDF5_ROOT. +[HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using `-DHDF5_ROOT`. **CUDA** -The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path. +The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. ## Obtaining OPS ```bash From 65e2ce43a582a58c2048fdb5e26778a861c7f023 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 23:24:34 +0100 Subject: [PATCH 187/324] Update installation.md --- doc/installation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index ba3ea88b30..f843facf2d 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -35,9 +35,9 @@ The CMake build system will detect the tookit automatically. If the automatic pr git clone https://github.com/gihanmudalige/OPS.git ``` -## Build OPS Back-end Libraries and Example Applications +## Build OPS ### Using cmake -#### Build the library and example applications together +#### Build library and example applications together Create a build directory, and run CMake (version 3.18 or newer) ```bash @@ -50,7 +50,7 @@ git clone https://github.com/gihanmudalige/OPS.git ``` After installation, the library and the python translator can be found at the direcory specified by CMAKE_INSTALL_PREFIX, together with the executable files for applications at APP_INSTALL_DIR. -#### Build the library and example applications separately +#### Build library and example applications separately In this mode, the library can be firstly built and installed as From 88a51e178ba36df1a858e49ce19493a1a20b3db5 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 23:25:26 +0100 Subject: [PATCH 188/324] Update installation.md --- doc/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index f843facf2d..69b6045554 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -77,9 +77,9 @@ A few tasks for testing codes can be run by ```bash make test ``` +The current tests are mainly based on the applications. --> -The current tests are mainly based on the applications. #### cmake options * `-DCMAKE_BUILD_TYPE=Release` - enable optimizations From 126bbe85ee53a2f9c144db0ace1075d70fb4b89b Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 23:28:36 +0100 Subject: [PATCH 189/324] Update installation.md --- doc/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index 69b6045554..abaa5995f6 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -48,7 +48,7 @@ git clone https://github.com/gihanmudalige/OPS.git make # IEEE=1 this option is important for applications to get accurate results make install # sudo is needed if a directory like /usr/local/ is chosen. ``` -After installation, the library and the python translator can be found at the direcory specified by CMAKE_INSTALL_PREFIX, together with the executable files for applications at APP_INSTALL_DIR. +After installation, the library and the python translator can be found at the direcory specified by `CMAKE_INSTALL_PREFIX`, together with the executable files for applications at `APP_INSTALL_DIR`. #### Build library and example applications separately From 878a86c1a2d52681cf87d8cad4a53b36ab1fdcad Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 23:30:49 +0100 Subject: [PATCH 190/324] Update devanapp.md --- doc/devanapp.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 62ce0a66c0..37d3d1d069 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,4 +1,4 @@ -# Developing and OPS Application +# Developing an OPS Application ## Tutorial ## Supported paralleizations ## Code-generation flags From a18ffe373d0f9c3ad14307b3fa7e29fe06a026c7 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Fri, 24 Sep 2021 23:32:20 +0100 Subject: [PATCH 191/324] Update devanapp.md --- doc/devanapp.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 37d3d1d069..3004738caf 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,5 +1,5 @@ # Developing an OPS Application ## Tutorial -## Supported paralleizations -## Code-generation flags +## Supported Paralleizations +## Code-generation Flags ## File I/O From 83184631374d38e036e74f68650ec01d26bb28e1 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 24 Sep 2021 23:32:54 +0100 Subject: [PATCH 192/324] Update devdoc.md --- doc/devdoc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index de190feabd..5e906c5729 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -3,7 +3,7 @@ ### Frontend API parser ### Target Parallel Templates ### Elemental Kernel Transformations -## Back-end library +## Back-end Library ### Sequential and multi-threaded CPU ### MPI and Partitioning ### HDF5 From 99562bf6e0f11b9faf56dc3da46c7d804541aad4 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 09:57:52 +0100 Subject: [PATCH 193/324] Create AUTHORS --- AUTHORS | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 AUTHORS diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000000..2d43b90743 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,10 @@ + +List of Authors + +Mike Giles +Gihan Mudalige +Istvan Reguly +Daniel Balogh +Toby Flynn +Satya Jammy +Jianping Meng From eb41efa4398f72a5dc11018f3dd4a302a7b78da1 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 10:33:46 +0100 Subject: [PATCH 194/324] Update installation.md --- doc/installation.md | 58 ++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index abaa5995f6..3e92a58771 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -55,28 +55,28 @@ After installation, the library and the python translator can be found at the di In this mode, the library can be firstly built and installed as ```bash - mkdir build - cd build - # Please see below for CMake options - cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL - make # IEEE=1 this option is important for applications to get accurate results - make install # sudo is needed if a system direction is chosen, - ``` +mkdir build +cd build +# Please see below for CMake options +cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL +make # IEEE=1 this option is important for applications to get accurate results +make install # sudo is needed if a system direction is chosen, +``` then the application can be built as ```bash - mkdir appbuild - cd appbuild - # Please see below for CMake options - cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 - make # IEEE=1 this option is important for applications to get accurate results - ``` +mkdir appbuild +cd appbuild +# Please see below for CMake options +cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 +make # IEEE=1 this option is important for applications to get accurate results +``` @@ -99,8 +99,34 @@ The current tests are mainly based on the applications. 
* `HDF5_ROOT` - Installation directory of HDF5 (to support HDF5 based File I/O) if HDF5 not installed in standard location --> ### Using Makefiles +#### Set up environmental variables: + + * `OPS_COMPILER` - compiler to be used (Currently supports Intel, PGI and Cray compilers, but others can be easily incorporated by extending the Makefiles used in step 2 and 3) + * `OPS_INSTALL_PATH` - Installation directory of OPS/ops + * `CUDA_INSTALL_PATH - Installation directory of CUDA, usually `/usr/local/cuda` (to build CUDA libs and applications) + * `OPENCL_INSTALL_PATH` - Installation directory of OpenCL, usually `/usr/local/cuda` for NVIDIA OpenCL implementation (to build OpenCL libs and applications) + * `MPI_INSTALL_PATH` - Installation directory of MPI (to build MPI based distributed memory libs and applications) + * `HDF5_INSTALL_PATH` - Installation directory of HDF5 (to support HDF5 based File I/O) + +See example scripts (e.g. source_intel, source_pgi_15.10, source_cray) under `OPS/ops/` that sets up the environment for building with various compilers (Intel, PGI, Cray). + #### Build back-end library -#### Build an application +For C/C++ back-end use Makefile under `OPS/ops/c` (modify Makefile if required). The libraries will be built in `OPS/ops/c/lib` +```bash +cd $OPS_INSTALL_PATH/c +make +``` +For Fortran back-end use Makefile under `OPS/ops/fortran` (modify Makefile if required). The libraries will be built in `OPS/ops/fortran/lib` +```bash +cd $OPS_INSTALL_PATH/fortran +make +``` +#### Build exampe applications +For example to build CloverLeaf_3D under `OPS/apps/c/CloverLeaf_3D` +```bash +cd ../apps/c/Cloverleaf_3D/ +make +``` #### Makefile options ## Running Example Applications From 2f3613df9ce18b5aab8f51adc0127b9c44570989 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 10:36:35 +0100 Subject: [PATCH 195/324] Update installation.md --- doc/installation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 3e92a58771..73c09bec39 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -108,7 +108,7 @@ The current tests are mainly based on the applications. * `MPI_INSTALL_PATH` - Installation directory of MPI (to build MPI based distributed memory libs and applications) * `HDF5_INSTALL_PATH` - Installation directory of HDF5 (to support HDF5 based File I/O) -See example scripts (e.g. source_intel, source_pgi_15.10, source_cray) under `OPS/ops/` that sets up the environment for building with various compilers (Intel, PGI, Cray). +See example scripts (e.g. source_intel, source_pgi_15.10, source_cray) under `OPS/ops/scripts` that sets up the environment for building with various compilers (Intel, PGI, Cray). #### Build back-end library For C/C++ back-end use Makefile under `OPS/ops/c` (modify Makefile if required). The libraries will be built in `OPS/ops/c/lib` @@ -127,7 +127,7 @@ For example to build CloverLeaf_3D under `OPS/apps/c/CloverLeaf_3D` cd ../apps/c/Cloverleaf_3D/ make ``` -#### Makefile options + ## Running Example Applications ### CloverLeaf From 580f52954eb64c7f3cfedc6cfc770806c6f55cd7 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 10:41:18 +0100 Subject: [PATCH 196/324] Update installation.md --- doc/installation.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/installation.md b/doc/installation.md index 73c09bec39..95c787fae4 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -30,6 +30,7 @@ Python2 is required by the OPS Python translator. The CMake build system will tr The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. 
Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. + ## Obtaining OPS ```bash git clone https://github.com/gihanmudalige/OPS.git From e4aaa00545ba0c5b8b4764fc950c8225a32c340f Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 10:45:15 +0100 Subject: [PATCH 197/324] Create apps.md --- doc/apps.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 doc/apps.md diff --git a/doc/apps.md b/doc/apps.md new file mode 100644 index 0000000000..efcea9dbfa --- /dev/null +++ b/doc/apps.md @@ -0,0 +1,4 @@ +# Example Applications +## CloverLeaf (2D, 3D and HDF5) +## poisson +## adi From 92075eaab6c6e8f030afc31b82c0e62fe81c3fda Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 10:45:40 +0100 Subject: [PATCH 198/324] Update index.rst --- doc/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/index.rst b/doc/index.rst index 0dcc8c4007..73f406e704 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -14,6 +14,7 @@ Welcome to Test's documentation! installation.md devanapp.md opsapi.md + apps.md devdoc.md pubs.md From fd268b5287f7e3a676c41185a6dcbe1f36a10eb8 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 10:45:48 +0100 Subject: [PATCH 199/324] Update installation.md --- doc/installation.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 95c787fae4..f69ec8c08c 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -130,10 +130,4 @@ make ``` -## Running Example Applications -### CloverLeaf -### CloverLeaf_3D_HDF5 -### poisson -### adi - ## Runtime Flags and Options From df98597e866d9ff0027780f5a27ce12f1b6e59fd Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 10:51:16 +0100 Subject: [PATCH 200/324] Update conf.py --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index c53bf5a553..c92c236874 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -18,7 +18,7 @@ # -- Project information ----------------------------------------------------- project = 'Oxford Parallel library for Structured mesh solvers' -copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others' +copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and [others](https://github.com/OP-DSL/OPS/blob/master/AUTHORS)' author = "Mike Giles, Istvan Reguly, Gihan Mudalige" # The full version, including alpha/beta/rc tags From 0f560b5b4c66e1d4fc43d6d802a76570884d181d Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 10:54:58 +0100 Subject: [PATCH 201/324] Update conf.py --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index c92c236874..10044a49ca 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -18,7 +18,7 @@ # -- Project information ----------------------------------------------------- project = 'Oxford Parallel library for Structured mesh solvers' -copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and [others](https://github.com/OP-DSL/OPS/blob/master/AUTHORS)' +copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and [https://github.com/OP-DSL/OPS/blob/master/AUTHORS]' author = "Mike Giles, Istvan Reguly, Gihan Mudalige" # The full version, including alpha/beta/rc tags From 9dea64486bb1087538b0df2bcdde233a11f8a2cf Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 10:56:25 +0100 Subject: [PATCH 202/324] Update conf.py --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 10044a49ca..c53bf5a553 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -18,7 +18,7 @@ # -- Project information ----------------------------------------------------- project = 'Oxford Parallel library for Structured mesh solvers' -copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and [https://github.com/OP-DSL/OPS/blob/master/AUTHORS]' +copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others' author = "Mike Giles, Istvan Reguly, Gihan Mudalige" # The full version, including alpha/beta/rc tags From 09839287b74ee1a1ed879409161259f465033692 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 11:02:09 +0100 Subject: [PATCH 203/324] Delete README --- README | 75 ---------------------------------------------------------- 1 file changed, 75 deletions(-) delete mode 100644 README diff --git a/README b/README deleted file mode 100644 index f7aa601f52..0000000000 --- a/README +++ /dev/null @@ -1,75 +0,0 @@ -OPS is an API with associated libraries and pre-processors to generate -parallel executables for applications on mulit-block structured grids. - -This repository contains the implementation of the run-time library -and the pre-processor, and is structured as follows: - -| -`- ops: Implementation of the user and run-time OPS C/C++ APIs -| -`- apps: Application examples in C and Fortran -| These are examples of user application code and also include -| the target code an OPS pre-processor should produce to correctly -| use the OPS run-time library. -| -`- translator: Python OPS pre-processor for C/C++ API -| -`- doc: Documentation - -Installation -============ - -1. 
Set up environmental variables: - - OPS_COMPILER - compiler to be used (Currently supports Intel, PGI and - Cray compilers, but others can be easily incorporated by extending the - Makefiles used in step 2 and 3) - - OPS_INSTALL_PATH - Installation directory of OPS/ops - - CUDA_INSTALL_PATH - Installation directory of CUDA, - usually /usr/local/cuda (to build CUDA libs and applications) - - OPENCL_INSTALL_PATH - Installation directory of OpenCL, - usually /usr/local/cuda for NVIDIA OpenCL implementation - (to build OpenCL libs and applications) - - MPI_INSTALL_PATH - Installation directory of MPI (to build MPI - based distributed memory libs and applications) - - HDF5_INSTALL_PATH - Installation directory of HDF5 - (to support HDF5 based File I/O) - - See example scripts (e.g. source_intel, source_pgi_15.10, source_cray) - under OPS/ops/ that sets up the environment for building with various - compilers (Intel, PGI, Cray). - -2. Build OPS back-end libraries. - - For C/C++ back-end use Makefile under OPS/ops/c (modify Makefile if required). - The libraries will be built in OPS/ops/c/lib - - cd $OPS_INSTALL_PATH/c - make - - - For Fortran back-end use Makefile under OPS/ops/fortran - (modify Makefile if required). The libraries will be built in OPS/ops/fortran/lib - - cd $OPS_INSTALL_PATH/fortran - make - - -3. Build OPS example applications - - For example to build CloverLeaf_3D under OPS/apps/c/CloverLeaf_3D - - cd ../apps/c/Cloverleaf_3D/ - make - - -How to cite -=========== -Istvan Z Reguly, G.R Mudalige, Mike B Giles. Loop Tiling in Large-Scale -Stencil Codes at Run-time with OPS. (2017) IEEE Transactions on Parallel -and Distributed Systems. (http://dx.doi.org/10.1109/TPDS.2017.2778161) From 834e0d24471bf10b8fa328c34851571deeb48e34 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 11:26:27 +0100 Subject: [PATCH 204/324] Update README.md --- README.md | 137 ++++++++++++------------------------------------------ 1 file changed, 31 insertions(+), 106 deletions(-) diff --git a/README.md b/README.md index a75182274c..68b548ecfd 100644 --- a/README.md +++ b/README.md @@ -1,117 +1,42 @@ -## OPS +# OPS -OPS is an API with associated libraries and pre-processors to generate -parallel executables for applications on multi-block structured grids. +OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. +This repository contains the implementation of the back-end library and the code-generator, and is structured as follows: -This repository contains the implementation of the run-time library -and the pre-processor, and is structured as follows: +* `ops`: Implementation of the user and run-time OPS C/C++ APIs +* `apps`: Application examples in C. + These are examples of user application code and also include the target parallel code generated by the OPS code generator. +* `ops_translator`: Python OPS code generator for C/C++ API +* `scripts` : example scripts for setting environmental variables and testing applications +* `cmake` : cmake installation files +* `makefiles` : makefile based installation files +* `doc`: Documentation -* ops: Implementation of the user and run-time OPS C/C++ APIs +## Documentation -* apps: Application examples in C. - These are examples of user application code and also include - the target code an OPS pre-processor should produce to correctly - use the OPS run-time library. 
- Currently the main application developed with OPS is a single - block structured mesh application - Cloverleaf originally - developed at https://github.com/Warwick-PCAV/CloverLeaf +OPS documentation can be viewed on [Read the Docs](https://ops-dsl.readthedocs.io/). -* translator: Python OPS pre-processor for C/C++ API +## Citing +To cite OPS, please reference the following paper: -* doc: Documentation +[I. Z. Reguly, G. R. Mudalige and M. B. Giles, Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS, in IEEE Transactions on Parallel and Distributed Systems, vol. 29, no. 4, pp. 873-886, 1 April 2018, doi: 10.1109/TPDS.2017.2778161.](https://ieeexplore.ieee.org/abstract/document/8121995) -#### Installation +``` +@ARTICLE{Reguly_et_al_2018, + author={Reguly, István Z. and Mudalige, Gihan R. and Giles, Michael B.}, + journal={IEEE Transactions on Parallel and Distributed Systems}, + title={Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS}, + year={2018}, + volume={29}, + number={4}, + pages={873-886}, + doi={10.1109/TPDS.2017.2778161}} +``` -**Note: The current CMakefile and relevant instructions are mainly tested on linux-based systems including Windows Subsystem for Linux** +## Contact +If you wish to report a bug with the software, please contact the [OP-DSL team](https://op-dsl.github.io/about.html) -##### Dependencies - - * CMake - - CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. - ```bash - version=3.19.0 - wget https://github.com/Kitware/CMake/releases/download/v$version/cmake-$version-Linux-x86_64.sh - # Assume that CMake is going to be installed at /usr/local/cmake - cmake_dir=/usr/local/cmake - # sudo is not necessary for directories in user space. 
- sudo mkdir $cmake_dir - sudo sh ./cmake-$version-Linux-x86_64.sh --prefix=$cmake_dir --skip-license - sudo ln -s $cmake_dir/bin/cmake /usr/local/bin/cmake - ``` - - * Python2 - - **Python2** is required by the OPS Python translator. The CMake build system will try to identify it automatically. However, the process can fail sometime (e.g., if there are both Python2 and Python3 installed). If this happens, the path to Python2 can be specified manually by using **-DPython2_EXECUTABLE** when invoking CMake - - * HDF5 - - [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using -DHDF5_ROOT. - - * CUDA - - The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path. - - - - -##### Build OPS back-end libraries example applications -###### Build the library and example applications together - - Create a build directory, and run CMake (version 3.18 or newer) - ```bash - mkdir build - cd build - # Please see below for CMake options - cmake ${PATH_TO_OPS} -DBUILD_OPS_APPS=ON -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL -DGPU_NUMBER=1 - make # IEEE=1 this option is important for applications to get accurate results - make install # sudo is needed if a directory like /usr/local/ is chosen. - ``` -After installation, the library and the python translator can be found at the direcory specified by CMAKE_INSTALL_PREFIX, together with the executable files for applications at APP_INSTALL_DIR. 
- -###### Build the library and example applications separately - -In this mode, the library can be firstly built and installed as - -```bash - mkdir build - cd build - # Please see below for CMake options - cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL - make # IEEE=1 this option is important for applications to get accurate results - make install # sudo is needed if a system direction is chosen, - ``` -then the application can be built as - -```bash - mkdir appbuild - cd appbuild - # Please see below for CMake options - cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1 - make # IEEE=1 this option is important for applications to get accurate results - ``` -###### Tests - -A few tasks for testing codes can be run by -```bash - make test - ``` -The current tests are mainly based on the applications. -###### Options of interest to specify to `cmake` include: - - * `-DCMAKE_BUILD_TYPE=Release` - enable optimizations - * `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only) - * `-DOPS_TEST=ON` - enable the tests - * `-DCMAKE_INSTALL_PREFIX=` - specify the installation direction for the library (/usr/local by default, Library CMake only) - * `-DAPP_INSTALL_DIR=` - specify the installation direction for the applications ($HOME/OPS-APPS by default) - * `-DGPU_NUMBER=` - specify the number of GPUs used in the tests - * `-DOPS_INSTALL_DIR=` - specify where the OPS library is installed (Application CMake only, see [here](#build-the-library-and-example-applications-separately)) - * `-DOPS_VERBOSE_WARNING=ON` - show verbose output during building process - - +## Licence +OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From 652512005af2819d7a7ceac034eb9a5c9f2efce5 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 11:41:46 +0100 Subject: [PATCH 205/324] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 68b548ecfd..6d63ce13cc 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # OPS +[![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/build.svg)](https://gitlab.com/op-dsl-ci/ops-ci) +[![Documentation Status](https://ops-dsl.readthedocs.io/badge/?version=latest)](https://ops-dsl.readthedocs.io/en/latest/?badge=latest) + OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. This repository contains the implementation of the back-end library and the code-generator, and is structured as follows: From 52aff82a9bef791b17bbd7eebb9810e973f26f6f Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 12:01:40 +0100 Subject: [PATCH 206/324] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6d63ce13cc..356f3338bf 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # OPS -[![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/build.svg)](https://gitlab.com/op-dsl-ci/ops-ci) -[![Documentation Status](https://ops-dsl.readthedocs.io/badge/?version=latest)](https://ops-dsl.readthedocs.io/en/latest/?badge=latest) +[![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/pipeline.svg)](https://gitlab.com/op-dsl-ci/ops-ci) +[![Documentation Status](https://readthedocs.org/projects/ops-dsl/badge/?version=latest)](https://ops-dsl.readthedocs.io/en/latest/?badge=latest) + OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. From 17c6e709d37b259ae60745a22cc5466d336e8cff Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:03:22 +0100 Subject: [PATCH 207/324] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 356f3338bf..2077d3fd19 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,10 @@ # OPS +OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. 
+ [![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/pipeline.svg)](https://gitlab.com/op-dsl-ci/ops-ci) [![Documentation Status](https://readthedocs.org/projects/ops-dsl/badge/?version=latest)](https://ops-dsl.readthedocs.io/en/latest/?badge=latest) - -OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. - This repository contains the implementation of the back-end library and the code-generator, and is structured as follows: * `ops`: Implementation of the user and run-time OPS C/C++ APIs From 88b31e5dd308410bb7cc78ebb2b0c866e8076c76 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:06:05 +0100 Subject: [PATCH 208/324] Update index.rst --- doc/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/index.rst b/doc/index.rst index 73f406e704..f991bca140 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -3,7 +3,7 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to Test's documentation! +Welcome to OPS documentation! ================================ .. toctree:: From f74c009ff7936e029353d435110eb86b7581a6c0 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:18:31 +0100 Subject: [PATCH 209/324] Update installation.md --- doc/installation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index f69ec8c08c..349277b097 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -86,8 +86,8 @@ The current tests are mainly based on the applications. 
* `-DCMAKE_BUILD_TYPE=Release` - enable optimizations * `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only) * `-DOPS_TEST=ON` - enable the tests - * `-DCMAKE_INSTALL_PREFIX=` - specify the installation direction for the library (/usr/local by default, Library CMake only) - * `-DAPP_INSTALL_DIR=` - specify the installation direction for the applications ($HOME/OPS-APPS by default) + * `-DCMAKE_INSTALL_PREFIX=` - specify the installation direction for the library (`/usr/local` by default, Library CMake only) + * `-DAPP_INSTALL_DIR=` - specify the installation direction for the applications (`$HOME/OPS-APPS` by default) * `-DGPU_NUMBER=` - specify the number of GPUs used in the tests * `-DOPS_INSTALL_DIR=` - specify where the OPS library is installed (Application CMake only, see [here](#build-the-library-and-example-applications-separately)) * `-DOPS_VERBOSE_WARNING=ON` - show verbose output during building process @@ -104,7 +104,7 @@ The current tests are mainly based on the applications. * `OPS_COMPILER` - compiler to be used (Currently supports Intel, PGI and Cray compilers, but others can be easily incorporated by extending the Makefiles used in step 2 and 3) * `OPS_INSTALL_PATH` - Installation directory of OPS/ops - * `CUDA_INSTALL_PATH - Installation directory of CUDA, usually `/usr/local/cuda` (to build CUDA libs and applications) + * `CUDA_INSTALL_PATH` - Installation directory of CUDA, usually `/usr/local/cuda` (to build CUDA libs and applications) * `OPENCL_INSTALL_PATH` - Installation directory of OpenCL, usually `/usr/local/cuda` for NVIDIA OpenCL implementation (to build OpenCL libs and applications) * `MPI_INSTALL_PATH` - Installation directory of MPI (to build MPI based distributed memory libs and applications) * `HDF5_INSTALL_PATH` - Installation directory of HDF5 (to support HDF5 based File I/O) From 2cba8b0158b5d23146b632edc2b7b0a2613d6eb0 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 12:26:41 +0100 Subject: [PATCH 210/324] Update installation.md --- doc/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index 349277b097..c3ba25f83c 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -7,7 +7,7 @@ **CMake** CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. - ```bash + ```bash {r} version=3.19.0 wget https://github.com/Kitware/CMake/releases/download/v$version/cmake-$version-Linux-x86_64.sh # Assume that CMake is going to be installed at /usr/local/cmake From f7be66de1233d79b6c81db255c6665db7c1bec42 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:36:37 +0100 Subject: [PATCH 211/324] Update introduction.md --- doc/introduction.md | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/doc/introduction.md b/doc/introduction.md index d7d6a499c3..4ac754d449 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -1,13 +1,29 @@ # Introduction -OPS is a high-level framework with associated libraries and -preprocessors to generate parallel executables for applications on -**multi-block structured grids**. Multi-block structured grids consists -of an unstructured collection of structured meshes/grids. This document -describes the OPS C++ API, which supports the development of -single-block and multi-block structured meshes. - ## Overview + +OPS is a high-level framework with associated libraries and preprocessors to generate parallel executables for applications on **multi-block structured grids**. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. 
This document describes the OPS C++ API, which supports the development of single-block and multi-block structured meshes. + ## Licencing +OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. + ## Citing +To cite OPS, please reference the following paper: + +[I. Z. Reguly, G. R. Mudalige and M. B. Giles, Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS, in IEEE Transactions on Parallel and Distributed Systems, vol. 29, no. 4, pp. 873-886, 1 April 2018, doi: 10.1109/TPDS.2017.2778161.](https://ieeexplore.ieee.org/abstract/document/8121995) + +``` +@ARTICLE{Reguly_et_al_2018, + author={Reguly, István Z. and Mudalige, Gihan R. and Giles, Michael B.}, + journal={IEEE Transactions on Parallel and Distributed Systems}, + title={Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS}, + year={2018}, + volume={29}, + number={4}, + pages={873-886}, + doi={10.1109/TPDS.2017.2778161}} +``` +Full list of publications from the OPS project can be found in the [Publications](https://opensbli.readthedocs.io/en/latest/citing.html) section. + ## Support +The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html). From a47dd0f7b77b12e5c7d959a2027c6b15eeceb9c2 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 12:37:36 +0100 Subject: [PATCH 212/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index 4ac754d449..c35c0cac9d 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -23,7 +23,7 @@ To cite OPS, please reference the following paper: pages={873-886}, doi={10.1109/TPDS.2017.2778161}} ``` -Full list of publications from the OPS project can be found in the [Publications](https://opensbli.readthedocs.io/en/latest/citing.html) section. +Full list of publications from the OPS project can be found in the [Publications](https://ops-dsl.readthedocs.io/en/markdowndocdev/pubs.html) section. ## Support The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html). From 33f08e391e8eff872491efb58afe03b73c9e6e9b Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:40:39 +0100 Subject: [PATCH 213/324] Update apps.md --- doc/apps.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/apps.md b/doc/apps.md index efcea9dbfa..b023ec04ee 100644 --- a/doc/apps.md +++ b/doc/apps.md @@ -1,4 +1,4 @@ -# Example Applications +# Examples ## CloverLeaf (2D, 3D and HDF5) ## poisson ## adi From 543faf49b6643bcb8224779bc633438309ec302a Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:47:20 +0100 Subject: [PATCH 214/324] Update installation.md --- doc/installation.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/installation.md b/doc/installation.md index c3ba25f83c..babf9e69e1 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -27,9 +27,16 @@ Python2 is required by the OPS Python translator. 
The CMake build system will tr [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using `-DHDF5_ROOT`. **CUDA** +The CUDA backend targets NVIDIA GPUs with a compute capability of 3.0 or greater. The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. +**HIP** +The HIP backend targets AMD GPUs which are supported by the ROCm stack + +**SYCL** + +**Tridiagonal Solver** ## Obtaining OPS ```bash From d5bdb3f20bc4f2858bb5e142c41037af11e7f664 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 12:47:33 +0100 Subject: [PATCH 215/324] Update installation.md --- doc/installation.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/installation.md b/doc/installation.md index babf9e69e1..bb90d33c65 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -32,6 +32,7 @@ The CUDA backend targets NVIDIA GPUs with a compute capability of 3.0 or greater The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. **HIP** + The HIP backend targets AMD GPUs which are supported by the ROCm stack **SYCL** From 5235355bd95cab0d47defe856daef70906bda9f9 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 12:51:21 +0100 Subject: [PATCH 216/324] Update installation.md --- doc/installation.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index bb90d33c65..eac7ec076e 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -27,9 +27,8 @@ Python2 is required by the OPS Python translator. The CMake build system will tr [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using `-DHDF5_ROOT`. **CUDA** -The CUDA backend targets NVIDIA GPUs with a compute capability of 3.0 or greater. - -The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. + +The [CUDA](https://developer.nvidia.com/cuda-downloads) backend targets NVIDIA GPUs with a compute capability of 3.0 or greater. The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. **HIP** From 1f7cf8daa5e7bf2a1b0e5e96513cb9bc999d20d0 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 12:58:33 +0100 Subject: [PATCH 217/324] Update installation.md --- doc/installation.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index eac7ec076e..f91be6b8c6 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -39,8 +39,9 @@ The HIP backend targets AMD GPUs which are supported by the ROCm stack **Tridiagonal Solver** ## Obtaining OPS +The latest OPS source code can be obtained by cloning the [OPS repository](https://github.com/OP-DSL/OPS) using ```bash -git clone https://github.com/gihanmudalige/OPS.git +git clone https://github.com/OP-DSL/OPS.git ``` ## Build OPS From 9f136986194266f5e117264b007a0c821e030839 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 15:20:21 +0100 Subject: [PATCH 218/324] Update installation.md --- doc/installation.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/installation.md b/doc/installation.md index f91be6b8c6..4967718e60 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -38,6 +38,12 @@ The HIP backend targets AMD GPUs which are supported by the ROCm stack **Tridiagonal Solver** +To use the tridiagonal solver OPS API in applications and build example applications such as `adi`, `adi_burger` and `adi_burger_3D` the open source tridiagonal solver (scalar) library needs to be cloned and built from the [Tridsolver repository](https://github.com/OP-DSL/tridsolver). +```bash +git clone https://github.com/OP-DSL/tridsolver.git +``` +Details on building scalar tridiagonal solver library can be found in the [README](https://github.com/OP-DSL/tridsolver/blob/master/scalar/README) file located at the appropriate subdirectory. + ## Obtaining OPS The latest OPS source code can be obtained by cloning the [OPS repository](https://github.com/OP-DSL/OPS) using ```bash From 856d915ec6e61ad044268661cd5928e3cc9bcbb8 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 15:23:53 +0100 Subject: [PATCH 219/324] Update installation.md --- doc/installation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 4967718e60..ec6acbc0c8 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -60,7 +60,7 @@ git clone https://github.com/OP-DSL/OPS.git cd build # Please see below for CMake options cmake ${PATH_TO_OPS} -DBUILD_OPS_APPS=ON -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL -DGPU_NUMBER=1 - make # IEEE=1 this option is important for applications to get accurate results + make # IEEE=1 enable IEEE flags in compiler make install # sudo is needed if a directory like /usr/local/ is chosen. ``` After installation, the library and the python translator can be found at the directory specified by `CMAKE_INSTALL_PREFIX`, together with the executable files for applications at `APP_INSTALL_DIR`. @@ -74,7 +74,7 @@ mkdir build cd build # Please see below for CMake options cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL -make # IEEE=1 this option is important for applications to get accurate results +make # IEEE=1 enable IEEE flags in compiler make install # sudo is needed if a system directory is chosen. ``` then the application can be built as From e4253a95dc88e501c226727b4fb5d3beb896172d Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 15:51:16 +0100 Subject: [PATCH 220/324] Update introduction.md --- doc/introduction.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/introduction.md b/doc/introduction.md index c35c0cac9d..73497527aa 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -27,3 +27,8 @@ Full list of publications from the OPS project can be found in the [Publications ## Support The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. 
Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html). + +## Funding +The development of OPS was in part supported by the UK Engineering and Physical Sciences Research Council (EPSRC) grants [EP/K038567/1](http://gow.epsrc.ac.uk/NGBOViewGrant.aspx?GrantRef=EP/K038567/1) (“Future-proof massively-parallel execution of multi-block applications”), [EP/J010553/1](http://gow.epsrc.ac.uk/NGBOViewGrant.aspx?GrantRef=EP/J010553/1) (“Software for Emerging Architectures - ASEArch"), The UK Turbulence Consortium grant [EP/T026170/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/T026170/1), The Janos Bolyai Research Scholarship of the Hungarian Academy of Sciences, the Royal Society through their Industry Fellowship Scheme (INF/R1/180012), and the Thematic Research Cooperation Establishing Innovative Informatic and Info-communication Solutions Project, which has been supported by the European Union and co-financed by the European Social Fund under grant number EFOP-3.6.2-16-2017-00013. Research funding support was also provided by the UK AWE under grants CDK0660 ("The Production of Predictive Models for Future Computing Requirements"), CDK0724 ("AWE Technical Outreach Programme"), AWE grant for "High-level Abstractions for Performance, Portability and Continuity of Scientific Software on Future Computing Systems" and the Numerical Algorithms Group [NAG](https://www.nag.com/). + +Hardware resources for development and testing provided by the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. 
DE-AC05-00OR22725, the [ARCHER](http://www.archer.ac.uk) and [ARCHER2](https://www.archer2.ac.uk/) UK National Supercomputing Service, [University of Oxford Advanced Research Computing (ARC) facility](http://dx.doi.org/10.5281/zenodo.22558) and through hardware donations and access provided by NVIDIA and Intel. From 3ca8fa593f0762a5058329301849b9ef11643fb8 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 15:59:29 +0100 Subject: [PATCH 221/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index 73497527aa..e50ff7d19c 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -29,6 +29,6 @@ Full list of publications from the OPS project can be found in the [Publications The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html). ## Funding -The development of OPS was in part supported by the UK Engineering and Physical Sciences Research Council (EPSRC) grants [EP/K038567/1](http://gow.epsrc.ac.uk/NGBOViewGrant.aspx?GrantRef=EP/K038567/1) (“Future-proof massively-parallel execution of multi-block applications”), [EP/J010553/1](http://gow.epsrc.ac.uk/NGBOViewGrant.aspx?GrantRef=EP/J010553/1) (“Software for Emerging Architectures - ASEArch"), The UK Turbulence Consortium grant [EP/T026170/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/T026170/1), The Janos Bolyai Research Scholarship of the Hungarian Academy of Sciences, the Royal Society through their Industry Fellowship Scheme (INF/R1/180012), and the Thematic Research Cooperation Establishing Innovative Informatic and Info-communication Solutions Project, which has been supported by the European Union and co-financed by the European Social Fund under grant number EFOP-3.6.2-16-2017-00013. 
Research funding support was also provided by the UK AWE under grants CDK0660 ("The Production of Predictive Models for Future Computing Requirements"), CDK0724 ("AWE Technical Outreach Programme"), AWE grant for "High-level Abstractions for Performance, Portability and Continuity of Scientific Software on Future Computing Systems" and the Numerical Algorithms Group [NAG](https://www.nag.com/). +The development of OPS was in part supported by the UK Engineering and Physical Sciences Research Council (EPSRC) grants [EP/K038494/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/K038494/1) (“Future-proof massively-parallel execution of multi-block applications”), [EP/J010553/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/J010553/1) (“Software for Emerging Architectures - ASEArch"), The UK Turbulence Consortium grant [EP/T026170/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/T026170/1), The Janos Bolyai Research Scholarship of the Hungarian Academy of Sciences, the Royal Society through their Industry Fellowship Scheme (INF/R1/180012), and the Thematic Research Cooperation Establishing Innovative Informatic and Info-communication Solutions Project, which has been supported by the European Union and co-financed by the European Social Fund under grant number EFOP-3.6.2-16-2017-00013. Research funding support was also provided by the UK AWE under grants CDK0660 ("The Production of Predictive Models for Future Computing Requirements"), CDK0724 ("AWE Technical Outreach Programme"), AWE grant for "High-level Abstractions for Performance, Portability and Continuity of Scientific Software on Future Computing Systems" and the Numerical Algorithms Group [NAG](https://www.nag.com/). Hardware resources for development and testing provided by the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. 
DE-AC05-00OR22725, the [ARCHER](http://www.archer.ac.uk) and ARCHER2(https://www.archer2.ac.uk/) UK National Supercomputing Service, [University of Oxford Advanced Research Computing (ARC) facility](http://dx.doi.org/10.5281/zenodo.22558) and through hardware donations and access provided by NVIDIA and Intel. From 8d2e1c005b0d6074a622b0d968d646ee9f0f9d4a Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:12:23 +0100 Subject: [PATCH 222/324] Update setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0b0fbbc774..94c56f8368 100644 --- a/setup.py +++ b/setup.py @@ -5,8 +5,8 @@ setup(name='ops', version='dev', description='OPS is an API with associated libraries and preprocessors to generate parallel executables for applications on mulit-block structured meshes.', - author='Mike Giles, Istvan Reguly, Gihan Mudalige, and others', - url='http://www.oerc.ox.ac.uk/projects/ops', + author='Gihan Mudalige, Istvan Reguly, Mike Giles, and others', + url='https://op-dsl.github.io/', packages=['ops_translator', 'ops_translator.c', 'ops_translator.fortran'], scripts=[], classifiers=[ From 2402be6c67279fa75be1e5e872fed69585828f50 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:13:12 +0100 Subject: [PATCH 223/324] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2077d3fd19..781adcf87e 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,6 @@ To cite OPS, please reference the following paper: ## Contact If you wish to report a bug with the software, please contact the [OP-DSL team](https://op-dsl.github.io/about.html) -## Licence +## License OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. 
From 4a6c9275cd2c36f9d0c1ba869beb44b285bb9b03 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:17:35 +0100 Subject: [PATCH 224/324] Update apps.md --- doc/apps.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/apps.md b/doc/apps.md index b023ec04ee..4361e0ffde 100644 --- a/doc/apps.md +++ b/doc/apps.md @@ -1,4 +1,5 @@ # Examples -## CloverLeaf (2D, 3D and HDF5) +## CloverLeaf 2D, +## CloverLeaf 3D with HDF5 ## poisson ## adi From f580f87aea546db39c99c6adc1a01a153e294a82 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:17:44 +0100 Subject: [PATCH 225/324] Update apps.md --- doc/apps.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/apps.md b/doc/apps.md index 4361e0ffde..5bee74c5de 100644 --- a/doc/apps.md +++ b/doc/apps.md @@ -1,5 +1,5 @@ # Examples -## CloverLeaf 2D, +## CloverLeaf 2D ## CloverLeaf 3D with HDF5 ## poisson ## adi From e21436b0430a7074a5b6e354d47dd4b0a33da54b Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:27:03 +0100 Subject: [PATCH 226/324] Create perf.md --- doc/perf.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/perf.md diff --git a/doc/perf.md b/doc/perf.md new file mode 100644 index 0000000000..3da5366b8a --- /dev/null +++ b/doc/perf.md @@ -0,0 +1 @@ +Performance Tuning From 21dc44d5e2cedaa0342c4c142ff84764731f8c8b Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:27:22 +0100 Subject: [PATCH 227/324] Update index.rst --- doc/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/index.rst b/doc/index.rst index f991bca140..39efe81875 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -15,6 +15,7 @@ Welcome to OPS documentation! devanapp.md opsapi.md apps.md + perf.md devdoc.md pubs.md From 69d2bbd18935c32ac660ddb6839ca87d36d0a574 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Sat, 25 Sep 2021 16:33:14 +0100 Subject: [PATCH 228/324] Update perf.md --- doc/perf.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index 3da5366b8a..8b0dccc449 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -1 +1,9 @@ -Performance Tuning +# Performance Tuning + +## Compiler flags for vectorization +## Cache-blocking Tiling +## OpenMP with MPI +## CUDA arguments +## CUDA-aware MPI +## OpenCL arguments + From a600a122d7aa81466cc9fc6852256092ac88a395 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 16:49:11 +0100 Subject: [PATCH 229/324] Update perf.md --- doc/perf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index 8b0dccc449..40a2e43fe9 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -1,6 +1,6 @@ # Performance Tuning -## Compiler flags for vectorization +## Vectorization ## Cache-blocking Tiling ## OpenMP with MPI ## CUDA arguments From bba8a4c311c002ee6b4a4c5c7e84068ce2626db1 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 17:01:06 +0100 Subject: [PATCH 230/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index e50ff7d19c..3ccdb1db21 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -2,7 +2,7 @@ ## Overview -OPS is a high-level framework with associated libraries and preprocessors to generate parallel executables for applications on **multi-block structured grids**. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. This document describes the OPS C++ API, which supports the development of single-block and multi-block structured meshes. 
+[OPS](https://github.com/OP-DSL/OPS)(Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. ## Licencing OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From a867d6f759c482e6f50db3165c0f16781e280fb8 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Sat, 25 Sep 2021 17:03:44 +0100 Subject: [PATCH 231/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index 3ccdb1db21..7487a62c9c 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -2,7 +2,7 @@ ## Overview -[OPS](https://github.com/OP-DSL/OPS)(Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. 
+[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. ## Licencing OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From 3377be5f0fce03dd71ed4d23e12fb63604890c58 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 11:41:14 +0100 Subject: [PATCH 232/324] Update README.md --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 781adcf87e..059e57a2f1 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,17 @@ To cite OPS, please reference the following paper: ## Contact If you wish to report a bug with the software, please contact the [OP-DSL team](https://op-dsl.github.io/about.html) +## Contributing + +To contribute to OPS please use the following steps : + +1. Clone this repository (on your local system) +2. Create a new branch in your cloned repository +3. Make changes / contributions in your new branch +4. Submit your changes by creating a Pull Request to the `develop` branch of the OPS repository + +The contributions in the `develop` branch will be merged into the master branch as we create a new release. + ## License OPS is released as an open-source project under the BSD 3-Clause License. 
See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From ce3a4bc4c86f29b6c899fee3de39e81c90330f05 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 11:43:30 +0100 Subject: [PATCH 233/324] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 059e57a2f1..9c0555f554 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,8 @@ To cite OPS, please reference the following paper: doi={10.1109/TPDS.2017.2778161}} ``` -## Contact -If you wish to report a bug with the software, please contact the [OP-DSL team](https://op-dsl.github.io/about.html) +## Support and Contact +The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html). ## Contributing From 2a51e0e330ec65e74cb2134ca83ad8b4663cb477 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 12:15:43 +0100 Subject: [PATCH 234/324] Update opsapi.md --- doc/opsapi.md | 54 ++++++++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 408bf9e341..13712c3517 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -1,44 +1,28 @@ # OPS API +## Overview + +Many of the API and library follows the structure of the OP2 high-level library for unstructured mesh +applications~\cite{op2}. + +The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The key idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. 
The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. + +To clarify some of the important issues in the API, we note here some needs connected with a 3D application: +* When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. +* To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. +* To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. +* In multigrid, we are working with two grids with one having twice as many points as the other in each direction. To handle this we require a stencil with a non-unit stride. +* In multi-block grids, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. an $i,j$ face of one block may correspond to the $j,k$ face of another block. + ## Key concepts and structure -An OPS application can generally be divided into two key parts: -initialisation and parallel execution. During the initialisation phase, -one or more blocks (ops_block) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a -block, and have a specific size (in each dimension of the block), which -may be slightly different across different datasets (e.g. 
staggered -grids), in some directions they may be degenerate (a size of 1), or they -can represent data associated with different multigrid levels (where -their size if a multiple or a fraction of other datasets). Datasets can -be declared with empty (NULL) pointers, then OPS will allocate the -appropriate amount of memory, may be passed non-NULL pointers (currently -only supported in non-MPI environments), in which case OPS will assume -the memory is large enough for the data and the block halo, and there -are HDF5 dataset declaration routines which allow the distributed -reading of datasets from HDF5 files. The concept of blocks is necessary -to group datasets together, as in a multi-block problem, in a -distributed memory environment, OPS needs to be able to determine how to +An OPS application can generally be divided into two key parts: (1) initialisation and (2) parallel execution. During the initialisation phase, one or more blocks ( we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. 
The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to decompose the problem. -The initialisation phase usually also consists of defining the stencils -to be used later on (though they can be defined later as well), which -describe the data access patterns used in parallel loops. Stencils are -always relative to the "current" point; e.g. if at iteration $(i,j)$, we -wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two -points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in -one of the dimensions the dataset's size is 1), as well as for -multigrid, there are special strided, restriction, and prolongation -stencils: they differ from normal stencils in that as one steps through -a grid in a parallel loop, the stepping is done with a non-unit stride -for these datasets. For example, in a 2D problem, if we have a -degenerate dataset called xcoords, size $(N,1)$, then we will need a -stencil with stride $(1,0)$ to access it in a regular 2D loop. - -Finally, the initialisation phase may declare a number of global -constants - these are variables in global scope that can be accessed -from within user kernels, without having to pass them in explicitly. -These may be scalars or small arrays, generally for values that do not -change during execution, though they may be updated during execution +The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration $(i,j)$, we wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two points: $\{(-1, 0), (0, 0)\}$. 
To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride +for these datasets. For example, in a 2D problem, if we have a degenerate dataset called xcoords, size $(N,1)$, then we will need a stencil with stride $(1,0)$ to access it in a regular 2D loop. + +Finally, the initialisation phase may declare a number of global constants - these are variables in global scope that can be accessed from within elemental kernels, without having to pass them in explicitly. These may be scalars or small arrays, generally for values that do not change during execution, though they may be updated during execution with repeated calls to `ops_decl_const`. The initialisation phase is terminated by a call to `ops_partition`. From ae6569d1ce1eaf2464940dcad71e1a2842bb8f5d Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 12:43:10 +0100 Subject: [PATCH 235/324] Update opsapi.md --- doc/opsapi.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 13712c3517..08cb669984 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -2,9 +2,6 @@ ## Overview -Many of the API and library follows the structure of the OP2 high-level library for unstructured mesh -applications~\cite{op2}. - The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The key idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. 
The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. To clarify some of the important issues in the API, we note here some needs connected with a 3D application: From 718ceb28c57bb413441876c2703317f9b93f1591 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 12:44:59 +0100 Subject: [PATCH 236/324] Update opsapi.md --- doc/opsapi.md | 41 +++++++---------------------------------- 1 file changed, 7 insertions(+), 34 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 08cb669984..ca1d41b542 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -4,13 +4,6 @@ The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The key idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. -To clarify some of the important issues in the API, we note here some needs connected with a 3D application: -* When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. -* To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. -* To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. 
if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. -* In multigrid, we are working with two grids with one having twice as many points as the other in each direction. To handle this we require a stencil with a non-unit stride. -* In multi-block grids, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. an $i,j$ face of one block may correspond to the $j,k$ face of another block. - ## Key concepts and structure An OPS application can generally be divided into two key parts: (1) initialisation and (2) parallel execution. During the initialisation phase, one or more blocks ( we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to @@ -98,34 +91,14 @@ blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. 
an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. -To clarify some of the important issues in designing the API, we note -here some needs connected with a 3D application: - -- When looping over the interior with loop indices $i,j,k$, often - there are 1D arrays which are referenced using just one of the - indices. - -- To implement boundary conditions, we often loop over a 2D face, - accessing both the 3D dataset and data from a 2D dataset. - -- To implement periodic boundary conditions using dummy "halo" points, - we sometimes have to copy one plane of boundary data to another. - e.g. if the first dimension has size $I$ then we might copy the - plane $i=I\!-\!2$ to plane $i=0$, and plane $i=1$ to plane - $i=I\!-\!1$. - -- In multigrid, we are working with two grids with one having twice as - many points as the other in each direction. To handle this we - require a stencil with a non-unit stride. - -- In multi-block grids, we have several structured blocks. The - connectivity between the faces of different blocks can be quite - complex, and in particular they may not be oriented in the same way, - i.e. an $i,j$ face of one block may correspond to the $j,k$ face of - another block. This is awkward and hard to handle simply. +To clarify some of the important issues in the API, we note here some needs connected with a 3D application: +* When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. +* To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. +* To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. 
+* In multigrid, we are working with two grids with one having twice as many points as the other in each direction. To handle this we require a stencil with a non-unit stride. +* In multi-block grids, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. an $i,j$ face of one block may correspond to the $j,k$ face of another block. -The latest proposal is to handle all of these different requirements -through stencil definitions. +OPS handle all of these different requirements through stencil definitions. ## OPS C++ API From bc7396ecc175bef1f2c4ef65ff6e84d37268cb2f Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 12:58:26 +0100 Subject: [PATCH 237/324] Update opsapi.md --- doc/opsapi.md | 84 ++++++++------------------------------------------- 1 file changed, 13 insertions(+), 71 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index ca1d41b542..4f13c141e7 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -17,81 +17,23 @@ with repeated calls to `ops_decl_const`. The initialisation phase is terminated by a call to `ops_partition`. -The bulk of the application consists of parallel loops, implemented -using calls to `ops_par_loop`. These constructs work with datasets, -passed through the opaque `ops_dat` handles declared during the -initialisation phase. The iterations of parallel loops are semantically -independent, and it is the responsibility of the user to enforce this: -the order in which iterations are executed cannot affect the result -(within the limits of floating point precision). Parallel loops are -defined on a block, with a prescribed iteration range that is always -defined from the perspective of the dataset written/modified (the sizes -of datasets, particularly in multigrid situations, may be very -different). 
Datasets are passed in using `ops_arg_dat`, and during -execution, values at the current grid point will be passed to the user -kernel. These values are passed wrapped in a templated `ACC<>` object -(templated on the type of the data), whose parentheses operator is -overloaded, which the user must use to specify the relative offset to -access the grid point's neighbours (which accesses have to match the the -declared stencil). Datasets written may only be accessed with a -one-point, zero-offset stencil (otherwise the parallel semantics may be -violated). - -Other than datasets, one can pass in read-only scalars or small arrays -that are iteration space invariant with `ops_arg_gbl` (typically -weights, $\delta t$, etc. which may be different in different loops). -The current iteration index can also be passed in with `ops_arg_idx`, -which will pass a globally consistent index to the user kernel (i.e. +The bulk of the application consists of parallel loops, implemented using calls to `ops_par_loop`. These constructs work with datasets, passed through the opaque `ops_dat` handles declared during the initialisation phase. The iterations of parallel loops are semantically independent, and it is the responsibility of the user to enforce this: +the order in which iterations are executed cannot affect the result (within the limits of floating point precision). Parallel loops are defined on a block, with a prescribed iteration range that is always defined from the perspective of the dataset written/modified (the sizes of datasets, particularly in multigrid situations, may be very +different). Datasets are passed in using `ops_arg_dat`, and during execution, values at the current grid point will be passed to the user kernel. 
These values are passed wrapped in a templated `ACC<>` object (templated on the type of the data), whose parentheses operator is overloaded, which the user must use to specify the relative offset to +access the grid point's neighbours (which accesses have to match the declared stencil). Datasets written may only be accessed with a one-point, zero-offset stencil (otherwise the parallel semantics may be violated). + +Other than datasets, one can pass in read-only scalars or small arrays that are iteration space invariant with `ops_arg_gbl` (typically weights, $\delta t$, etc. which may be different in different loops). The current iteration index can also be passed in with `ops_arg_idx`, which will pass a globally consistent index to the user kernel (i.e. also under MPI). -Reductions in loops are done using the ops_arg_reduce argument, which -takes a reduction handle as an argument. The result of the reduction can -then be acquired using a separate call to `ops_reduction_result`. The -semantics are the following: a reduction handle after it was declared is -in an "uninitialised" state. The first time it is used as an argument to -a loop, its type is determined (increment/min/max), and is initialised -appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in -parallel loops are combined together, up until the point, where the -result is acquired using `ops_reduction_result`, which then sets it back -to an uninitialised state. This also implies, that different parallel -loops, which all use the same reduction handle, but are otherwise -independent, are independent and their partial reduction results can be -combined together associatively and commutatively. - -OPS takes responsibility for all data, its movement and the execution of -parallel loops.
With different execution hardware and optimisations, -this means OPS will re-organise data as well as execution (potentially -across different loops), and therefore any data accesses or manipulation -may only be done through the OPS API. - -This restriction is exploited by a lazy execution mechanism in OPS. The -idea is that OPS API calls that do not return a result can be not -executed immediately, rather queued, and once an API call requires -returning some data, operations in the queue are executed, and the -result is returned. This allows OPS to analyse and optimise operations -in the queue together. This mechanism is fully automated by OPS, and is -used with the various \_tiled executables. For more information on how -to use this mechanism for improving CPU performance, see Section -[\[sec:tiling\]](#sec:tiling){reference-type="ref" -reference="sec:tiling"}. Some API calls triggering the execution of -queued operations include ops_reduction_result, and the functions in the -data access API. +Reductions in loops are done using the `ops_arg_reduce` argument, which takes a reduction handle as an argument. The result of the reduction can then be acquired using a separate call to `ops_reduction_result`. The semantics are the following: a reduction handle after it was declared is in an "uninitialised" state. The first time it is used as an argument to a loop, its type is determined (increment/min/max), and is initialised appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in parallel loops are combined together, up until the point, where the result is acquired using `ops_reduction_result`, which then sets it back to an uninitialised state. This also implies, that different parallel loops, which all use the same reduction handle, but are otherwise independent, are independent and their partial reduction results can be combined together associatively and commutatively. 
+OPS takes responsibility for all data, its movement and the execution of parallel loops. With different execution hardware and optimisations, this means OPS will **re-organise** data as well as execution (potentially across different loops), and therefore **any data accesses or manipulation must only be done through the OPS API**. + +This restriction is exploited by a lazy execution mechanism in OPS. The idea is that OPS API calls that do not return a result need not be executed immediately, rather queued, and once an API call requires returning some data, operations in the queue are executed, and the result is returned. This allows OPS to analyse and optimise operations +in the queue together. This mechanism is fully automated by OPS, and is used with the various `_tiled` executables. For more information on how to use this mechanism for improving CPU performance, see Section on Tiling. Some API calls triggering the execution of queued operations include `ops_reduction_result`, and the functions in the +data access API. -Many of the API and library follows the structure of the OP2 high-level -library for unstructured mesh applications [@op2]. However the -structured mesh domain is distinct from the unstructured mesh -applications domain due to the implicit connectivity between -neighbouring mesh elements (such as vertices, cells) in structured -meshes/grids. The key idea is that operations involve looping over a -"rectangular" multi-dimensional set of grid points using one or more -"stencils" to access data. In multi-block grids, we have several -structured blocks. The connectivity between the faces of different -blocks can be quite complex, and in particular they may not be oriented -in the same way, i.e. an $i,j$ face of one block may correspond to the -$j,k$ face of another block. This is awkward and hard to handle simply. 
- -To clarify some of the important issues in the API, we note here some needs connected with a 3D application: +To clarify some of the important issues encountered when designing the OPS API, we note here some needs connected with a 3D application: * When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. * To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. * To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. From c7c2f3cefa9115686de766f718827302cc68ff10 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 12:59:48 +0100 Subject: [PATCH 238/324] Update opsapi.md --- doc/opsapi.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 4f13c141e7..f5fb04fe29 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -4,7 +4,7 @@ The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The key idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. -## Key concepts and structure +## Key Concepts and Structure An OPS application can generally be divided into two key parts: (1) initialisation and (2) parallel execution. 
During the initialisation phase, one or more blocks ( we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to decompose the problem. @@ -33,7 +33,7 @@ This restriction is exploited by a lazy execution mechanism in OPS. The idea is in the queue together. This mechanism is fully automated by OPS, and is used with the various `_tiled` executables. For more information on how to use this mechanism for improving CPU performance, see Section on Tiling. Some API calls triggering the execution of queued operations include `ops_reduction_result`, and the functions in the data access API. 
-To clarify some of the important issues encountered when designing the OPS API, we note here some needs connected with a 3D application: +To further clarify some of the important issues encountered when designing the OPS API, we note here some needs connected with a 3D application: * When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. * To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. * To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. From f80f2e87cc437fca66735f8a995a74c964e98f53 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 13:15:02 +0100 Subject: [PATCH 239/324] Update opsapi.md --- doc/opsapi.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index f5fb04fe29..a2771d3dbf 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -43,4 +43,10 @@ To further clarify some of the important issues encountered when designing the O OPS handle all of these different requirements through stencil definitions. ## OPS C++ API - +### Initialisation declaration and termination routines +### Diagnostic and output routines +### Halo exchange +### Parallel loop syntax +### Stencils +### Checkpointing +### Access to OPS data From 2b7281875f879fbc52733cb14a64277da6952a29 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Mon, 27 Sep 2021 13:17:37 +0100 Subject: [PATCH 240/324] Update opsapi.md --- doc/opsapi.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index a2771d3dbf..abeed444cf 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -43,7 +43,8 @@ To further clarify some of the important issues encountered when designing the O OPS handle all of these different requirements through stencil definitions. ## OPS C++ API -### Initialisation declaration and termination routines +### Initialisation and termination routines +### Declaration routines ### Diagnostic and output routines ### Halo exchange ### Parallel loop syntax From e739adf847b8438309937f8810c6a745202fe1ce Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 14:31:54 +0100 Subject: [PATCH 241/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index 7487a62c9c..ac9e9b2da0 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -2,7 +2,7 @@ ## Overview -[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. Multi-block structured grids consists of an unstructured collection of structured meshes/grids. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. 
+[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. ## Licencing OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From d5f95a5f58b8a924ad5b5232f34ac822cac4a47d Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 14:33:10 +0100 Subject: [PATCH 242/324] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9c0555f554..bec553a5ac 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # OPS -OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing multi-block structured mesh algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation of the high-level code on multi-core and many-core architectures. +OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. 
The OPS API is embedded in C/C++ and Fortran. + [![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/pipeline.svg)](https://gitlab.com/op-dsl-ci/ops-ci) [![Documentation Status](https://readthedocs.org/projects/ops-dsl/badge/?version=latest)](https://ops-dsl.readthedocs.io/en/latest/?badge=latest) From ef7b56ef1585487c5ca7deeeccc310ede8959fb2 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 27 Sep 2021 14:34:01 +0100 Subject: [PATCH 243/324] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bec553a5ac..89ddd03dd3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # OPS -OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. +OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes. The OPS API is embedded in C/C++ and Fortran. [![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/pipeline.svg)](https://gitlab.com/op-dsl-ci/ops-ci) From 887504cafdca6b338c8a6f771085f7c31f766630 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Mon, 27 Sep 2021 14:37:43 +0100 Subject: [PATCH 244/324] Update opsapi.md --- doc/opsapi.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index abeed444cf..9bf6144ebb 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -2,7 +2,7 @@ ## Overview -The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The key idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. +The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The main idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. ## Key Concepts and Structure From 14251b6c3ff5f141c29f032ce0c981329fb67a7b Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Mon, 27 Sep 2021 14:42:44 +0100 Subject: [PATCH 245/324] Update opsapi.md --- doc/opsapi.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 9bf6144ebb..956e5c7323 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -6,7 +6,7 @@ The key characteristic of structured mesh applications is the implicit connectiv ## Key Concepts and Structure -An OPS application can generally be divided into two key parts: (1) initialisation and (2) parallel execution. During the initialisation phase, one or more blocks ( we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to +The OPS API allows to declare a computation over such multi-block structured meshes. An OPS application can generally be declared in two key parts: (1) initialisation and (2) iteration over the mesh (carried out as a parallel loop). 
During the initialisation phase, one or more blocks (we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size is a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to decompose the problem. The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration $(i,j)$, we wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride From bb77bab477a374523894152325dd55895f856ea2 Mon Sep 17 00:00:00 2001 From: "Gihan R.
Mudalige" Date: Mon, 27 Sep 2021 14:47:21 +0100 Subject: [PATCH 246/324] Update opsapi.md --- doc/opsapi.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 956e5c7323..3e884e3589 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -2,7 +2,7 @@ ## Overview -The key characteristic of structured mesh applications is the implicit connectivity between neighbouring mesh elements (such as vertices, cells). The main idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. +The key characteristic of structured mesh applications is the implicit connectivity between neighboring mesh elements (such as vertices, cells). The main idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e.~an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply. 
## Key Concepts and Structure From 9b563c7ddfd596605a244822b28446ee05eb4dfe Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Wed, 29 Sep 2021 12:42:19 +0100 Subject: [PATCH 247/324] Fix a few violation to good markdown rules --- doc/opsapi.md | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 3e884e3589..5b15f651f1 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -27,27 +27,36 @@ also under MPI). Reductions in loops are done using the `ops_arg_reduce` argument, which takes a reduction handle as an argument. The result of the reduction can then be acquired using a separate call to `ops_reduction_result`. The semantics are the following: a reduction handle after it was declared is in an "uninitialised" state. The first time it is used as an argument to a loop, its type is determined (increment/min/max), and is initialised appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in parallel loops are combined together, up until the point, where the result is acquired using `ops_reduction_result`, which then sets it back to an uninitialised state. This also implies, that different parallel loops, which all use the same reduction handle, but are otherwise independent, are independent and their partial reduction results can be combined together associatively and commutatively. -OPS takes responsibility for all data, its movement and the execution of parallel loops. With different execution hardware and optimisations, this means OPS will **re-organise** data as well as execution (potentially across different loops), and therefore **any data accesses or manipulation must only be done through the OPS API**. +OPS takes responsibility for all data, its movement and the execution of parallel loops. 
With different execution hardware and optimisations, this means OPS will **re-organise** data as well as execution (potentially across different loops), and therefore **any data accesses or manipulation must only be done through the OPS API**. This restriction is exploited by a lazy execution mechanism in OPS. The idea is that OPS API calls that do not return a result need not be executed immediately, rather queued, and once an API call requires returning some data, operations in the queue are executed, and the result is returned. This allows OPS to analyse and optimise operations in the queue together. This mechanism is fully automated by OPS, and is used with the various `_tiled` executables. For more information on how to use this mechanism for improving CPU performance, see Section on Tiling. Some API calls triggering the execution of queued operations include `ops_reduction_result`, and the functions in the data access API. To further clarify some of the important issues encountered when designing the OPS API, we note here some needs connected with a 3D application: -* When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. -* To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. -* To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. -* In multigrid, we are working with two grids with one having twice as many points as the other in each direction. To handle this we require a stencil with a non-unit stride. -* In multi-block grids, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. 
an $i,j$ face of one block may correspond to the $j,k$ face of another block. + +* When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices. +* To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset. +* To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another. e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$. +* In multigrid, we are working with two grids with one having twice as many points as the other in each direction. To handle this we require a stencil with a non-unit stride. +* In multi-block grids, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. an $i,j$ face of one block may correspond to the $j,k$ face of another block. OPS handle all of these different requirements through stencil definitions. -## OPS C++ API +## C/C++ API + ### Initialisation and termination routines + ### Declaration routines + ### Diagnostic and output routines + ### Halo exchange + ### Parallel loop syntax + ### Stencils + ### Checkpointing + ### Access to OPS data From 752a56db60401e61007747f747e11f5c133c0dc3 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sat, 9 Oct 2021 21:36:58 +0100 Subject: [PATCH 248/324] Port latex doc to markdown --- doc/perf.md | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index 40a2e43fe9..be069ba51b 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -1,9 +1,56 @@ # Performance Tuning ## Vectorization + +## Executing with GPUDirect + +GPU direct support for MPI+CUDA, to enable (on the OPS side) add +**-gpudirect** when running the executable. 
You may also have to use +certain environmental flags when using different MPI distributions. For +an example of the required flags and environmental settings on the +Cambridge Wilkes2 GPU cluster see:\ + ## Cache-blocking Tiling +OPS has a code generation (ops_gen_mpi_lazy) and build target for +tiling. Once compiled, to enable, use the `OPS_TILING` runtime parameter. This will look at the L3 cache size of your CPU and guess the correct +tile size. If you want to alter the amount of cache to be used for the +guess, use the ``OPS_CACHE_SIZE=XX`` runtime parameter, where the value is +in Megabytes. To manually specify the tile sizes, use the +``OPS_TILESIZE_X``, ``OPS_TILESIZE_Y``, and ``OPS_TILESIZE_Z`` runtime arguments. + +When MPI is combined with OpenMP, tiling can be extended to the MPI +halos. Set `OPS_TILING_MAXDEPTH` to increase the halo depths so that +halos for multiple `ops_par_loops` can be exchanged with a single MPI +message (see [@TPDS2017] for more details)\ +To test, compile CloverLeaf under ``apps/c/CloverLeaf``, modify clover.in +to use a $6144^2$ mesh, then run as follows:\ +For OpenMP with tiling: +```bash +export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING +``` +For MPI+OpenMP with tiling: +```bash +export OMP_NUM_THREADS=xx; mpirun -np xx ./cloverleaf_mpi_tiled OPS_TILING OPS_TILING_MAXDEPTH=6 +``` +To manually specify the tile sizes (in number of grid points), use the +OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments: +```bash +export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING OPS_TILESIZE_X=600 OPS_TILESIZE_Y=200 +``` ## OpenMP with MPI ## CUDA arguments +The CUDA (and OpenCL) thread block sizes can be controlled by setting +the ``OPS_BLOCK_SIZE_X``, ``OPS_BLOCK_SIZE_Y`` and ``OPS_BLOCK_SIZE_Z`` runtime +arguments.
For example, +```bash +./cloverleaf_cuda OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 +``` ## CUDA-aware MPI -## OpenCL arguments +## OpenCL arguments + +`OPS_CL_DEVICE=XX` runtime flag sets the OpenCL device to execute the +code on. + +Usually `OPS_CL_DEVICE=0` selects the CPU and `OPS_CL_DEVICE=1` selects +GPUs. From af719db4571e0c5314999e313a6558d90c9175c8 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sat, 9 Oct 2021 21:37:46 +0100 Subject: [PATCH 249/324] port latex doc (API) to markdown --- doc/opsapi.md | 618 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 618 insertions(+) diff --git a/doc/opsapi.md b/doc/opsapi.md index 5b15f651f1..f9c91351fb 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -47,16 +47,634 @@ OPS handle all of these different requirements through stencil definitions. ### Initialisation and termination routines +#### ops_init + +__void ops_init(int argc, char** argv, int diags_level)__ + +This routine must be called before all other OPS routines + +| Arguments | Description | +| ----------- | ----------- | +| argc, argv | the usual command line arguments | +| diags_level | an integer which defines the level of debugging diagnostics and reporting to be performed | + +Currently, higher diags_levels does the following checks + +`diags_level` $=$ 1 : no diagnostics, default to achieve best runtime +performance. + +`diags_level` $>$ 1 : print block decomposition and `ops_par_loop` +timing breakdown. + +`diags_level` $>$ 4 : print intra-block halo buffer allocation feedback +(for OPS internal development only) + +`diags_level` $>$ 5 : check if intra-block halo MPI sends depth match +MPI receives depth (for OPS internal development only) + +#### ops_exit + +__void ops_exit()__ + +This routine must be called last to cleanly terminate the OPS computation. + ### Declaration routines +#### ops_decl_block + +__ops_block ops_decl_block(int dims, char *name)__ + +This routine defines a structured grid block. 
+| Arguments | Description | +| ----------- | ----------- | +| dims | dimension of the block | +| name | a name used for output diagnostics | + +#### ops_decl_block_hdf5 + +__ops_block ops_decl_block_hdf5(int dims, char *name, char *file)__ + +This routine reads the details of a structured grid block from a named HDF5 file + +| Arguments | Description | +| ----------- | ----------- | +| dims | dimension of the block | +| name | a name used for output diagnostics | +| file |hdf5 file to read and obtain the block information from| + +Although this routine does not read in any extra information about the +block from the named HDF5 file than what is already specified in the +arguments, it is included here for error checking (e.g. check if blocks +defined in an HDF5 file are matching with the declared arguments in an +application) and completeness. + +#### ops_decl_dat + +__ops_dat ops_decl_dat(ops_block block, int dim, int *size, int *base, int *d_m, int *d_p, T *data, char *type, char *name)__ + +This routine defines a dataset. + +| Arguments | Description | +| ----------- | ----------- | +block | structured block | +dim | dimension of dataset (number of items per grid element) | +size | size in each dimension of the block | +base | base indices in each dimension of the block | +d_m | padding from the face in the negative direction for each dimension (used for block halo) | +d_p | padding from the face in the positive direction for each dimension (used for block halo) | +data | input data of type *T* | +type | the name of type used for output diagnostics (e.g. ``double``,``float``)| +name | a name used for output diagnostics| + +The `size` allows declaring different sized data arrays on a given +`block`. `d_m` and `d_p` are depth of the "block halos" that are used to +indicate the offset from the edge of a block (in both the negative and +positive directions of each dimension).
+ +#### ops_decl_dat_hdf5 + +__ops_dat ops_decl_dat_hdf5(ops_block block, int dim, char *type, char *name, char *file)__ + +This routine defines a dataset to be read in from a named hdf5 file + +| Arguments | Description | +| ----------- | ----------- | +|block | structured block| +|dim | dimension of dataset (number of items per grid element)| +type | the name of type used for output diagnostics (e.g. ``double``,``float``)| +|name | name of the dat used for output diagnostics| +|file | hdf5 file to read and obtain the data from| + +#### ops_decl_const + +__void ops_decl_const(char const * name, int dim, char const * type, T * data )__ + +This routine defines a global constant: a variable in global scope. Global constants need to be declared upfront + so that they can be correctly handled for different parallelizations. For e.g CUDA on GPUs. Once defined + they remain unchanged throughout the program, unless changed by a call to ops_update_const(..). The ``name'' and``type'' + parameters **must** be string literals since they are used in the code generation step + +| Arguments | Description | +| ----------- | ----------- | +|name | a name used to identify the constant | +|dim | dimension of dataset (number of items per element) | +|type | the name of type used for output diagnostics (e.g. 
``double'',``float'') | +|data | pointer to input data of type *T* | + +#### ops_decl_halo + +__ops_halo ops_decl_halo(ops_dat from, ops_dat to, int *iter_size, int* from_base, int *to_base, int *from_dir, int *to_dir)__ + +| Arguments | Description | +| ----------- | ----------- | +|from | origin dataset | +|to| destination dataset | +|item_size | defines an iteration size (number of indices to iterate over in each direction) | +|from_base | indices of starting point in \"from\" dataset| +|to_base | indices of starting point in \"to\" dataset | +|from_dir | direction of incrementing for \"from\" for each dimension of `iter_size` | +|to_dir | direction of incrementing for \"to\" for each dimension of `iter_size`| + +A from_dir \[1,2\] and a to_dir \[2,1\] means that x in the first block +goes to y in the second block, and y in first block goes to x in second +block. A negative sign indicates that the axis is flipped. (Simple +example: a transfer from (1:2,0:99,0:99) to (-1:0,0:99,0:99) would use +iter_size = \[2,100,100\], from_base = \[1,0,0\], to_base = \[-1,0,0\], +from_dir = \[0,1,2\], to_dir = \[0,1,2\]. In more complex case this +allows for transfers between blocks with different orientations.) + +#### ops_decl_halo_hdf5 + +__ops_halo ops_decl_halo_hdf5(ops_dat from, ops_dat to, char* file)__ + +This routine reads in a halo relationship between two datasets defined on two different blocks from a named HDF5 file + +| Arguments | Description | +| ----------- | ----------- | +|from| origin dataset| +|to| destination dataset| +|file| hdf5 file to read and obtain the data from| + +#### ops_decl_halo_group + +__ops_halo_group ops_decl_halo_group(int nhalos, ops_halo *halos)__ + +This routine defines a collection of halos. Semantically, when an exchange is triggered for all halos in a group, there is no order defined in which they are carried out. 
+| Arguments | Description | +| ----------- | ----------- | +|nhalos| number of halos in *halos* | +|halos| array of halos| + +#### ops_decl_reduction_handle + +__ops_reduction ops_decl_reduction_handle(int size, char *type, char *name)__ +This routine defines a reduction handle to be used in a parallel loop. + +| Arguments | Description | +| ----------- | ----------- | +|size| size of data in bytes | +|type| the name of type used for output diagnostics (e.g. ``double'',``float'') | +|name| name of the dat used for output diagnostics| + +__void ops_reduction_result(ops_reduction handle, T *result)__ +This routine returns the reduced value held by a reduction handle. When OPS uses lazy execution, this will trigger the execution of all previously queued OPS operations. + +|handle| the *ops_reduction* handle | +|result| a pointer to write the results to, memory size has to match the declared | + +#### ops_partition + +__ops_partition(char *method)__ + +Triggers a multi-block partitioning across a distributed memory set of processes. (links to a dummy function for single node parallelizations). This routine should only be called after all the ops_halo ops_decl_block +and ops_halo ops_decl_dat statements have been declared + +| Arguments | Description | +| ----------- | ----------- | +|method| string describing the partitioning method. Currently this string is not used internally, but is simply a place-holder to indicate different partitioning methods in the future. | + ### Diagnostic and output routines +#### ops_diagnostic_output + +__void ops_diagnostic_output()__ + +This routine prints out various useful bits of diagnostic info about sets, mappings and datasets.
Usually used right +after an ops_partition() call to print out the details of the decomposition + +#### ops_printf + +__void ops_printf(const char * format, ...)__ + +This routine simply prints a variable number of arguments; it is created in place of the standard C +printf function which would print the same on each MPI process + +#### ops_timers + +__void ops_timers(double *cpu, double *et)__ + gettimeofday() based timer to start/end timing blocks of code + +| Arguments | Description | +| ----------- | ----------- | +|cpu| variable to hold the CPU time at the time of invocation| +|et| variable to hold the elapsed time at the time of invocation| + +#### ops_fetch_block_hdf5_file + +__void ops_fetch_block_hdf5_file(ops_block block, char *file)__ + +Write the details of an ops_block to a named HDF5 file. Can be used over MPI (puts the data in an ops_dat into an +HDF5 file using MPI I/O) + +| Arguments | Description | +| ----------- | ----------- | +|block| ops_block to be written| +|file| hdf5 file to write to| + +#### ops_fetch_stencil_hdf5_file + +__void ops_fetch_stencil_hdf5_file(ops_stencil stencil, char *file)__ + +Write the details of an ops_stencil to a named HDF5 file. Can be used over MPI (puts the data in an ops_dat into an HDF5 file using MPI I/O) + +| Arguments | Description | +| ----------- | ----------- | +|stencil| ops_stencil to be written +|file| hdf5 file to write to + +#### ops_fetch_dat_hdf5_file + +__void ops_fetch_dat_hdf5_file(ops_dat dat, const char *file)__ + +Write the details of an ops_dat to a named HDF5 file. Can be used over MPI (puts the data in an ops_dat into an +HDF5 file using MPI I/O) + +| Arguments | Description | +| ----------- | ----------- | +|dat| ops_dat to be written| +|file| hdf5 file to write to| + +#### ops_print_dat_to_txtfile + +__void ops_print_dat_to_txtfile(ops_dat dat, char *file)__ +Write the details of an ops_dat to a named text file.
When used under an MPI parallelization each MPI process +will write its own data set separately to the text file. As such it does not use MPI I/O. The data can be viewed using +a simple text editor + +| Arguments | Description | +| ----------- | ----------- | +|dat| ops_dat to be written| +|file| text file to write to| + +#### ops_timing_output + +__void ops_timing_output(FILE *os)__ + +Print OPS performance details to output stream + +| Arguments | Description | +| ----------- | ----------- | +|os| output stream, use stdout to print to standard out| + +#### ops_NaNcheck + +__void ops_NaNcheck(ops_dat dat)__ + +Check if any of the values held in the `dat` is a NaN. If a NaN +is found, prints an error message and exits. + +| Arguments | Description | +| ----------- | ----------- | +|dat| ops_dat to be checked| + ### Halo exchange +#### ops_halo_transfer + +__void ops_halo_transfer(ops_halo_group group)__ + +This routine exchanges all halos in a halo group and will block execution of subsequent computations that depend on +the exchanged data. + +| Arguments | Description | +| ----------- | ----------- | +|group| the halo group| + ### Parallel loop syntax +A parallel loop with N arguments has the following syntax: + +#### ops_par_loop + +__void ops_par_loop( void (*kernel)(...),char *name, ops_block block, int dims, int *range, ops_arg arg1,ops_arg arg2, ..., ops_arg argN )__ + +| Arguments | Description | +| ----------- | ----------- | +|kernel| user's kernel function with N arguments| +|name| name of kernel function, used for output diagnostics| +|block| the ops_block over which this loop executes| +|dims| dimension of loop iteration| +|range| iteration range array| +|args| arguments| + +The **ops_arg** arguments in **ops_par_loop** are provided by one of the +following routines, one for global constants and reductions, and the other +for OPS datasets.
+ +#### ops_arg_gbl + +__ops_arg ops_arg_gbl(T *data, int dim, char *type, ops_access acc)__ + +Passes a scalar or small array that is invariant of the iteration space (not to be confused with ops_decl_const, which facilitates global scope variables). + +| Arguments | Description | +| ----------- | ----------- | +|data| data array| +|dim| array dimension| +|type| string representing the type of data held in data| +|acc| access type| + +#### ops_arg_reduce + +__ops_arg ops_arg_reduce(ops_reduction handle, int dim, char *type, ops_access acc)__ + +Passes a pointer to a variable that needs to be incremented (or swapped for min/max reduction) by the user kernel. + +| Arguments | Description | +| ----------- | ----------- | +|handle| an *ops_reduction* handle| +|dim| array dimension (according to *type*)| +|type| string representing the type of data held in data| +|acc| access type| + +#### ops_arg_dat + +__ops_arg ops_arg_dat(ops_dat dat, ops_stencil stencil, char *type,ops_access acc)__ + +Passes a pointer wrapped in ac ACC<> object to the value(s) at the current grid point to the user kernel. The ACC object's parentheses operator has to be used for dereferencing the pointer. + +| Arguments | Description | +| ----------- | ----------- | +|dat| dataset| +|stencil| stencil for accessing data| +|type| string representing the type of data held in dataset| +|acc| access type| + +#### ops_arg_idx + +__ops_arg ops_arg_idx()__ + +Give you an array of integers (in the user kernel) that have the index of +the current grid point, i.e. idx[0] is the index in x, idx[1] is the index in y, etc. This is a globally consistent +index, so even if the block is distributed across different MPI partitions, it gives you the same indexes. Generally +used to generate initial geometry. + ### Stencils +The final ingredient is the stencil specification, for which we have two versions: simple and strided. 
+ +#### ops_decl_stencil + +__ops_stencil ops_decl_stencil(int dims,int points, int *stencil, char *name)__ + +| Arguments | Description | +| ----------- | ----------- | +|dims| dimension of loop iteration| +|points| number of points in the stencil| +|stencil| stencil for accessing data| +|name| string representing the name of the stencil| + +#### ops_decl_strided_stencil + +__ops_stencil ops_decl_strided_stencil(int dims, int points, int *stencil, int *stride, char *name)__ + +| Arguments | Description | +| ----------- | ----------- | +|dims| dimension of loop iteration| +|points| number of points in the stencil| +|stencil| stencil for accessing data| +|stride| stride for accessing data| +|name| string representing the name of the stencil| + +#### ops_decl_stencil_hdf5 + +__ops_stencil ops_decl_stencil_hdf5(int dims,int points, char *name, char* file)__ + +| Arguments | Description | +| ----------- | ----------- | +|dims| dimension of loop iteration| +|points| number of points in the stencil| +|name| string representing the name of the stencil| +|file| hdf5 file to write to| + + In the strided case, the semantics for the index of data to be +accessed, for stencil point*p*, in dimension *m* are defined as + +```c++ + stride[m]*loop_index[m] + stencil[p*dims+m] +``` + + where ``loop_index[m]`` is the iteration index (within the +user-defined iteration space) in the different dimensions. + +If, for one or more dimensions, both ``stride[m]`` and +``stencil[p*dims+m]`` are zero, then one of the following must be true; + +* the dataset being referenced has size 1 for these dimensions + +* these dimensions are to be omitted and so the dataset has +dimension equal to the number of remaining dimensions. + +See *OPS/apps/c/CloverLeaf/build_field.cpp* and *OPS/apps/c/CloverLeaf/generate.cpp* for an example *ops_decl_strided_stencil* declaration and its use in a loop,respectively. 
+ +These two stencil definitions probably take care of all of the +cases in the Introduction except for multiblock applications with interfaces +with different orientations -- this will need a third, even more general, +stencil specification. The strided stencil will handle both multigrid +(with a stride of 2 for example) and the boundary condition and reduced +dimension applications (with a stride of 0 for the relevant dimensions). + ### Checkpointing +OPS supports the automatic checkpointing of applications. Using the API below, the user specifies the file name for the +checkpoint and an average time interval between checkpoints, OPS will then automatically save all necessary information +periodically that is required to fast-forward to the last checkpoint if a crash occurred. Currently, when re-launching +after a crash, the same number of MPI processes have to be used. To enable checkpointing mode, the *OPS_CHECKPOINT* runtime argument has to be used. + +#### ops_checkpointing_init + +__bool ops_checkpointing_init(const char *filename, double interval, int options)__ + +Initialises the checkpointing system, has to be called after {\tt ops_partition}. Returns true if the application launches in restore +mode, false otherwise. + +| Arguments | Description | +| ----------- | ----------- | +|filename| name of the file for checkpointing. In MPI, this will automatically be post-fixed with the rank ID.| +|interval| average time (seconds) between checkpoints| +|options| a combinations of flags, listed in *ops_checkpointing.h*, also see below| + +* OPS_CHECKPOINT_INITPHASE - indicates that there are a number of parallel loops at the very beginning of the simulations which should be excluded from any checkpoint; mainly because they initialise datasets that do not change during the main body of the execution. During restore mode these loops are executed as usual. 
An example would be the computation of the mesh geometry, which can be excluded from the checkpoint if it is re-computed when recovering and restoring a checkpoint. The API call *void ops_checkpointing_initphase_done()* indicates the end of this initial phase. + +* OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually controls the location of the checkpoint, and explicitly specifies the list of \texttt{ops_dat}s to be saved. + +* OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the location of the checkpoint, and it also enables fast-forwarding, by skipping the execution of the +application (even though none of the parallel loops would actually execute, there may be significant work outside of those) up to the checkpoint + +* OPS_CHECKPOINT_MANUAL - Indicates that when the corresponding API function is called, the checkpoint should be created. Assumes the presence of the above two options as well. + +#### ops_checkpointing_manual_datlist + +__void ops_checkpointing_manual_datlist(int ndats, ops_dat *datlist)__ + +A user can call this routine at a point in the code to mark the location of a checkpoint. At this point, the list of datasets specified +will be saved. The validity of what is saved is not checked by the checkpointing algorithm assuming that the user knows +what data sets to be saved for full recovery. This routine should be called frequently (compared to check-pointing +frequency) and it will trigger the creation of the checkpoint the first time it is called after the timeout occurs. + +| Arguments | Description | +| ----------- | ----------- | +|ndats| number of datasets to be saved| +|datlist| arrays of *ops_dat* handles to be saved| + +#### ops_checkpointing_fastfw + +__bool ops_checkpointing_fastfw(int nbytes, char *payload)__ + +A use can call this routine at a point in the code to mark the location of a checkpoint. At this point, the +specified payload (e.g. iteration count, simulation time, etc.) 
along with the necessary datasets, as determined by the +checkpointing algorithm will be saved. This routine should be called frequently (compared to checkpointing frequency), +will trigger the creation of the checkpoint the first time it is called after the timeout occurs. In restore mode, +will restore all datasets the first time it is called, and returns true indicating that the saved payload is returned +in payload. Does not save reduction data. + +| Arguments | Description | +| ----------- | ----------- | +|nbytes| size of the payload in bytes| +|payload| pointer to memory into which the payload is packed| + +#### ops_checkpointing_manual_datlist_fastfw + +__bool ops_checkpointing_manual_datlist_fastfw(int ndats, op_dat *datlist, int nbytes, char *payload)__ + +Combines the manual datlist and fastfw calls. + +| Arguments | Description | +| ----------- | ----------- | +|ndats| number of datasets to be saved| +|datlist| arrays of *ops_dat* handles to be saved| +|nbytes| size of the payload in bytes| +|payload| pointer to memory into which the payload is packed| + +#### ops_checkpointing_manual_datlist_fastfw_trigger + +__bool ops_checkpointing_manual_datlist_fastfw_trigger(int ndats, opa_dat *datlist, int +nbytes, char *payload)__ + +With this routine it is possible to manually trigger checkpointing, instead of relying on the timeout process. as such +it combines the manual datlist and fastfw calls, and triggers the creation of a checkpoint when called. + +| Arguments | Description | +| ----------- | ----------- | +|ndats| number of datasets to be saved| +|datlist| arrays of *ops_dat* handles to be saved| +|nbytes| size of the payload in bytes| +|payload| pointer to memory into which the payload is packed| + +\noindent The suggested use of these \textbf{manual} functions is of course when the optimal location for checkpointing +is known - one of the ways to determine that is to use the built-in algorithm. 
More details of this will be reported +in a tech-report on checkpointing, to be published later. + ### Access to OPS data + +This section describes APIs that give the user access to internal data structures in OPS and return data to user-space. These should be used cautiously and sparsely, as they can affect performance significantly. + +#### ops_dat_get_local_npartitions + +__int ops_dat_get_local_npartitions(ops_dat dat)__ + +This routine returns the number of chunks of the given dataset held by the current process. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| + +#### ops_dat_get_global_npartitions + +__int ops_dat_get_global_npartitions(ops_dat dat)__ +This routine returns the number of chunks of the given dataset held by all processes. +|dat| the dataset + +#### ops_dat_get_extents + +__void ops_dat_get_extents(ops_dat dat, int part, int *disp, int *sizes)__ + +This routine returns the MPI displacement and size of a given chunk of the given dataset on the current process. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|disp| an array populated with the displacement of the chunk within the ``global'' distributed array| +|sizes| an array populated with the spatial extents| + +#### ops_dat_get_raw_metadata + +__char* ops_dat_get_raw_metadata(ops_dat dat, int part, int *disp, int *size, int *stride, int *d_m, int *d_p)__ + +This routine returns array shape metadata corresponding to the ops_dat. Any of the arguments that are not of interest may be NULL.
+ +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|disp| an array populated with the displacement of the chunk within the ``global'' distributed array| +|size| an array populated with the spatial extents +|stride| an array populated strides in spatial dimensions needed for column-major indexing| +|d_m| an array populated with padding on the left in each dimension. Note that these are negative values| +|d_p| an array populated with padding on the right in each dimension| + +#### ops_dat_get_raw_pointer + +__char* ops_dat_get_raw_pointer(ops_dat dat, int part, ops_stencil stencil, ops_memspace *memspace)__ + +This routine returns a pointer to the internally stored data, with MPI halo regions automatically updated as required by the supplied stencil. The strides required to index into the dataset are also given. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|stencil| a stencil used to determine required MPI halo exchange depths| +|memspace| when set to OPS_HOST or OPS_DEVICE, returns a pointer to data in that memory space, otherwise must be set to 0, and returns whether data is in the host or on the device| + +#### ops_dat_release_raw_data + +__void ops_dat_release_raw_data(ops_dat dat, int part, ops_access acc)__ + +Indicates to OPS that a dataset previously accessed with ops_dat_get_raw_pointer is released by the user, and also tells OPS how it was accessed. + +A single call to ops_dat_release_raw_data() releases all pointers obtained by previous calls to ops_dat_get_raw_pointer() calls on the same dat and with the same *memspace argument, i.e. calls do not nest. 
+ +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset +|part| the chunk index (has to be 0)| +|acc| the kind of access that was used by the user (OPS_READ if it was read only, OPS_WRITE if it was overwritten, OPS_RW if it was read and written)| + +#### ops_dat_release_raw_data + +__void ops_dat_release_raw_data_memspace(ops_dat dat, int part, ops_access acc, ops_memspace *memspace)__ + +Indicates to OPS that a dataset previously accessed with ops_dat_get_raw_pointer is released by the user, and also tells OPS how it was accessed, and which memory space was used. + +A single call to ops_dat_release_raw_data() releases all pointers obtained by previous calls to ops_dat_get_raw_pointer() calls on the same dat and with the same *memspace argument, i.e. calls do not nest. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|acc| the kind of access that was used by the user (OPS_READ if it was read only, OPS_WRITE if it was overwritten, OPS_RW if it was read and written)| +|memspace| set to OPS_HOST or OPS_DEVICE | + +#### ops_dat_fetch_data + +__void ops_dat_fetch_data(ops_dat dat, int part, int *data)__ +This routine copies the data held by OPS to the user-specified memory location, which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0) | +|data| pointer to memory which should be filled by OPS| + +#### ops_dat_set_data + +__void ops_dat_set_data(ops_dat dat, int part, int *data)__ + +This routine copies the data given by the user to the internal data structure used by OPS. User data needs to be laid out in column-major order and strided as indicated by the sizes parameter of ops_dat_get_extents. 
+ +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|data| pointer to memory which should be copied to OPS | From 82c29e98fdcdaa544c46fd0257e29bff8c429f4f Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 1 Oct 2021 13:30:50 +0100 Subject: [PATCH 250/324] Update opsapi.md --- doc/opsapi.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index f9c91351fb..97ffbeea59 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -9,8 +9,8 @@ The key characteristic of structured mesh applications is the implicit connectiv The OPS API allows to declare a computation over such multi-block structured meshes. An OPS application can generally be declared in two key parts: (1) initialisation and (2) iteration over the mesh (carried out as a parallel loop). During the initialisation phase, one or more blocks (we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. 
The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to decompose the problem. -The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration $(i,j)$, we wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride -for these datasets. For example, in a 2D problem, if we have a degenerate dataset called xcoords, size $(N,1)$, then we will need a stencil with stride $(1,0)$ to access it in a regular 2D loop. +The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration *(i,j)*, we wish to access *(i-1,j)* and *(i,j)*, then the stencil will have two points: *{(-1, 0), (0, 0)}*. To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride +for these datasets. For example, in a 2D problem, if we have a degenerate dataset called xcoords, size *(N,1)*, then we will need a stencil with stride *(1,0)* to access it in a regular 2D loop. 
Finally, the initialisation phase may declare a number of global constants - these are variables in global scope that can be accessed from within elemental kernels, without having to pass them in explicitly. These may be scalars or small arrays, generally for values that do not change during execution, though they may be updated during execution with repeated calls to `ops_decl_const`. From e88a398a13c06547fea847e76cb96f98691a57f7 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 1 Oct 2021 13:44:47 +0100 Subject: [PATCH 251/324] Update opsapi.md --- doc/opsapi.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 97ffbeea59..058754879c 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -9,8 +9,8 @@ The key characteristic of structured mesh applications is the implicit connectiv The OPS API allows to declare a computation over such multi-block structured meshes. An OPS application can generally be declared in two key parts: (1) initialisation and (2) iteration over the mesh (carried out as a parallel loop). During the initialisation phase, one or more blocks (we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block, and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids), in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size if a multiple or a fraction of other datasets). 
Datasets can be declared with empty (NULL) pointers, then OPS will allocate the appropriate amount of memory, may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo, and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together, as in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to decompose the problem. -The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration *(i,j)*, we wish to access *(i-1,j)* and *(i,j)*, then the stencil will have two points: *{(-1, 0), (0, 0)}*. To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride -for these datasets. For example, in a 2D problem, if we have a degenerate dataset called xcoords, size *(N,1)*, then we will need a stencil with stride *(1,0)* to access it in a regular 2D loop. +The initialisation phase usually also consists of defining the stencils to be used later on (though they can be defined later as well), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration $(i,j)$, we wish to access $(i-1,j)$ and $(i,j)$, then the stencil will have two points: $\{(-1, 0), (0, 0)\}$. 
To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride +for these datasets. For example, in a 2D problem, if we have a degenerate dataset called xcoords, size $(N,1)$, then we will need a stencil with stride $(1,0)$ to access it in a regular 2D loop. Finally, the initialisation phase may declare a number of global constants - these are variables in global scope that can be accessed from within elemental kernels, without having to pass them in explicitly. These may be scalars or small arrays, generally for values that do not change during execution, though they may be updated during execution with repeated calls to `ops_decl_const`. From 18a45ea47825133206bbd80a8908d64624043255 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sat, 9 Oct 2021 21:54:02 +0100 Subject: [PATCH 252/324] Fix a few typos/latex tags in API --- doc/opsapi.md | 52 +++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 058754879c..3b81698c27 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -116,15 +116,15 @@ This routine defines a dataset. | Arguments | Description | | ----------- | ----------- | -block | structured block | -dim | dimension of dataset (number of items per grid element) | -size | size in each dimension of the block | -base | base indices in each dimension of the block | -d_m | padding from the face in the negative direction for each dimension (used for block halo) | -d_p | padding from the face in the positive direction for each dimension (used for block halo) | -data | input data of type *T* | -type | the name of type used for output diagnostics (e.g. 
``double``,``float``)| -name | a name used for output diagnostics| +|block | structured block | +|dim | dimension of dataset (number of items per grid element) | +|size | size in each dimension of the block | +|base | base indices in each dimension of the block | +|d_m | padding from the face in the negative direction for each dimension (used for block halo) | +|d_p | padding from the face in the positive direction for each dimension (used for block halo) | +|data | input data of type *T* | +|type | the name of type used for output diagnostics (e.g. ``double``,``float``)| +|name | a name used for output diagnostics| The `size` allows to declare different sized data arrays on a given `block`. `d_m` and `d_p` are depth of the "block halos" that are used to @@ -158,7 +158,7 @@ This routine defines a global constant: a variable in global scope. Global const | ----------- | ----------- | |name | a name used to identify the constant | |dim | dimension of dataset (number of items per element) | -|type | the name of type used for output diagnostics (e.g. ``double'',``float'') | +|type | the name of type used for output diagnostics (e.g. ``double``, ``float``) | |data | pointer to input data of type *T* | #### ops_decl_halo @@ -205,7 +205,7 @@ This routine defines a collection of halos. Semantically, when an exchange is tr |nhalos| number of halos in *halos* | |halos| array of halos| -#### ops_decl_reduction_handle} +#### ops_decl_reduction_handle __ops_reduction ops_decl_reduction_handle(int size, char *type, char *name)__ This routine defines a reduction handle to be used in a parallel loop @@ -213,7 +213,7 @@ This routine defines a reduction handle to be used in a parallel loop | Arguments | Description | | ----------- | ----------- | |size| size of data in bytes | -|type| the name of type used for output diagnostics (e.g. ``double'',``float'') | +|type| the name of type used for output diagnostics (e.g. 
``double``,``float``) | |name| name of the dat used for output diagnostics| __{void ops_reduction_result(ops_reduction handle, T *result) @@ -231,7 +231,7 @@ and ops_halo ops_decl_dat statements have been declared | Arguments | Description | | ----------- | ----------- | -|method| string describing the partitioning method. Currently this string is not used internally, but is simply a place-holder to indicate different partitioning methods in the future. | +|method| string describing the partitioning method. Currently this string is not used internally, but is simply a place-holder to indicate different partitioning methods in the future. | ### Diagnostic and output routines @@ -306,7 +306,7 @@ a simple text editor |dat| ops_dat to to be written| |file| text file to write to| -#### ops_timing_output} +#### ops_timing_output __void ops_timing_output(FILE *os)__ @@ -316,11 +316,11 @@ Print OPS performance performance details to output stream | ----------- | ----------- | |os| output stream, use stdout to print to standard out| -#### ops_NaNcheck} +#### ops_NaNcheck __void ops_NaNcheck(ops_dat dat)__ -Check if any of the values held in the \texttt{dat} is a NaN. If a NaN +Check if any of the values held in the *dat* is a NaN. If a NaN is found, prints an error message and exits. 
| Arguments | Description | @@ -346,7 +346,7 @@ A parallel loop with N arguments has the following syntax: #### ops_par_loop -__void ops_par_loop(\ void (*kernel)(...),char *name, ops_block block, int dims, int *range, ops_arg arg1,ops_arg arg2, ..., ops_arg argN )__ +__void ops_par_loop(void (*kernel)(...),char *name, ops_block block, int dims, int *range, ops_arg arg1,ops_arg arg2, ..., ops_arg argN )__ | Arguments | Description | | ----------- | ----------- | @@ -357,7 +357,7 @@ __void ops_par_loop(\ void (*kernel)(...),char *name, ops_block block, int dims, |range| iteration range array| |args| arguments| -The {\bf ops_arg} arguments in {\bf ops_par_loop} are provided by one of the +The **ps_arg** arguments in **ops_par_loop** are provided by one of the following routines, one for global constants and reductions, and the other for OPS datasets. @@ -454,7 +454,7 @@ accessed, for stencil point*p*, in dimension *m* are defined as stride[m]*loop_index[m] + stencil[p*dims+m] ``` - where ``loop_index[m]`` is the iteration index (within the +where ``loop_index[m]`` is the iteration index (within the user-defined iteration space) in the different dimensions. If, for one or more dimensions, both ``stride[m]`` and @@ -485,7 +485,7 @@ after a crash, the same number of MPI processes have to be used. To enable check __bool ops_checkpointing_init(const char *filename, double interval, int options)__ -Initialises the checkpointing system, has to be called after {\tt ops_partition}. Returns true if the application launches in restore +Initialises the checkpointing system, has to be called after *ops_partition*. Returns true if the application launches in restore mode, false otherwise. | Arguments | Description | @@ -496,7 +496,7 @@ mode, false otherwise. 
* OPS_CHECKPOINT_INITPHASE - indicates that there are a number of parallel loops at the very beginning of the simulations which should be excluded from any checkpoint; mainly because they initialise datasets that do not change during the main body of the execution. During restore mode these loops are executed as usual. An example would be the computation of the mesh geometry, which can be excluded from the checkpoint if it is re-computed when recovering and restoring a checkpoint. The API call *void ops_checkpointing_initphase_done()* indicates the end of this initial phase. -* OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually controls the location of the checkpoint, and explicitly specifies the list of \texttt{ops_dat}s to be saved. +* OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually controls the location of the checkpoint, and explicitly specifies the list of *ops_dat*s to be saved. * OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the location of the checkpoint, and it also enables fast-forwarding, by skipping the execution of the application (even though none of the parallel loops would actually execute, there may be significant work outside of those) up to the checkpoint @@ -561,7 +561,7 @@ it combines the manual datlist and fastfw calls, and triggers the creation of a |nbytes| size of the payload in bytes| |payload| pointer to memory into which the payload is packed| -\noindent The suggested use of these \textbf{manual} functions is of course when the optimal location for checkpointing +The suggested use of these **manual** functions is of course when the optimal location for checkpointing is known - one of the ways to determine that is to use the built-in algorithm. More details of this will be reported in a tech-report on checkpointing, to be published later. 
@@ -581,8 +581,12 @@ This routine returns the number of chunks of the given dataset held by the curre #### ops_dat_get_global_npartitions} -__int ops_dat_get_global_npartitions(ops_dat dat)} -{This routine returns the number of chunks of the given dataset held by all processes.} +__int ops_dat_get_global_npartitions(ops_dat dat)__ + +This routine returns the number of chunks of the given dataset held by all processes. + +| Arguments | Description | +| ----------- | ----------- | |dat| the dataset #### ops_dat_get_extents From fe1dce6987aa14149d3206e10fe69f647c428987 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sat, 9 Oct 2021 23:11:34 +0100 Subject: [PATCH 253/324] Add user kernel example to deveanapp --- doc/devanapp.md | 76 +++++ doc/user.md | 839 ------------------------------------------------ 2 files changed, 76 insertions(+), 839 deletions(-) delete mode 100644 doc/user.md diff --git a/doc/devanapp.md b/doc/devanapp.md index 3004738caf..26a8f14339 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,5 +1,81 @@ # Developing an OPS Application + ## Tutorial +## +## OPS User Kernels + +In OPS, the elemental operation carried out per mesh/grid point is +specified as an outlined function called a *user kernel*. An example +taken from the Cloverleaf application is given below. 
+ +```c++ +void accelerate_kernel( const ACC &density0, const ACC &volume, + ACC &stepbymass, const ACC &xvel0, ACC &xvel1, + const ACC &xarea, const ACC &pressure, + const ACC &yvel0, ACC &yvel1, + const ACC &yarea, const ACC &viscosity) { + + double nodal_mass; + + //{0,0, -1,0, 0,-1, -1,-1}; + nodal_mass = ( density0(-1,-1) * volume(-1,-1) + + density0(0,-1) * volume(0,-1) + + density0(0,0) * volume(0,0) + + density0(-1,0) * volume(-1,0) ) * 0.25; + + stepbymass(0,0) = 0.5*dt/ nodal_mass; + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, 0,-1}; + + xvel1(0,0) = xvel0(0,0) - stepbymass(0,0) * + ( xarea(0,0) * ( pressure(0,0) - pressure(-1,0) ) + + xarea(0,-1) * ( pressure(0,-1) - pressure(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, -1,0}; + + yvel1(0,0) = yvel0(0,0) - stepbymass(0,0) * + ( yarea(0,0) * ( pressure(0,0) - pressure(0,-1) ) + + yarea(-1,0) * ( pressure(-1,0) - pressure(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, 0,-1}; + + xvel1(0,0) = xvel1(0,0) - stepbymass(0,0) * + ( xarea(0,0) * ( viscosity(0,0) - viscosity(-1,0) ) + + xarea(0,-1) * ( viscosity(0,-1) - viscosity(-1,-1) ) ); + + //{0,0, -1,0, 0,-1, -1,-1}; + //{0,0, -1,0}; + + yvel1(0,0) = yvel1(0,0) - stepbymass(0,0) * + ( yarea(0,0) * ( viscosity(0,0) - viscosity(0,-1) ) + + yarea(-1,0) * ( viscosity(-1,0) - viscosity(-1,-1) ) ); + +} +``` + +This user kernel is then used in an `ops_par_loop` function. The key aspect to note in the user kernel is the use of the ACC\<\> objects and their +parentheses operator. These specify the stencil in accessing the +elements of the respective data arrays. 
+ +```c++ +int rangexy_inner_plus1[] = {x_min,x_max+1,y_min,y_max+1}; + +ops_par_loop(accelerate_kernel, "accelerate_kernel", clover_grid, 2, rangexy_inner_plus1, + ops_arg_dat(density0, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(volume, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), + ops_arg_dat(xvel0, 1, S2D_00, "double", OPS_READ), + ops_arg_dat(xvel1, 1, S2D_00, "double", OPS_INC), + ops_arg_dat(xarea, 1, S2D_00_0M1, "double", OPS_READ), + ops_arg_dat(pressure, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), + ops_arg_dat(yvel0, 1, S2D_00, "double", OPS_READ), + ops_arg_dat(yvel1, 1, S2D_00, "double", OPS_INC), + ops_arg_dat(yarea, 1, S2D_00_M10, "double", OPS_READ), + ops_arg_dat(viscosity, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ)); +``` ## Supported Paralleizations ## Code-generation Flags ## File I/O diff --git a/doc/user.md b/doc/user.md deleted file mode 100644 index d3ebca8478..0000000000 --- a/doc/user.md +++ /dev/null @@ -1,839 +0,0 @@ ---- -author: -- Mike Giles, Istvan Reguly, Gihan Mudalige -date: May 2019 -title: OPS C++ User's Manual ---- - - - - - -# OPS C++ API - -## Initialisation declaration and termination routines - -### {#section .unnumbered} - -::: list -plus 1pt minus 1pt - -the usual command line arguments - -an integer which defines the level of debugging diagnostics and -reporting to be performed -::: - -Currently, higher `diags_level`s does the following checks\ -`diags_level` $=$ 1 : no diagnostics, default to achieve best runtime -performance.\ -`diags_level` $>$ 1 : print block decomposition and `ops_par_loop` -timing breakdown.\ -`diags_level` $>$ 4 : print intra-block halo buffer allocation feedback -(for OPS internal development only)\ -`diags_level` $>$ 5 : check if intra-block halo MPI sends depth match -MPI receives depth (for OPS internal development only)\ - -### {#section-1 .unnumbered} - -::: list -plus 1pt minus 1pt - -dimension of the block - -a 
name used for output diagnostics -::: - -### {#section-2 .unnumbered} - -::: list -plus 1pt minus 1pt - -dimension of the block - -a name used for output diagnostics - -hdf5 file to read and obtain the block information from -::: - -Although this routine does not read in any extra information about the -block from the named HDF5 file than what is already specified in the -arguments, it is included here for error checking (e.g. check if blocks -defined in an HDF5 file is matching with the declared arguments in an -application) and completeness.\ - -### {#section-3 .unnumbered} - -::: list -plus 1pt minus 1pt - -structured block - -dimension of dataset (number of items per grid element) - -size in each dimension of the block - -base indices in each dimension of the block - -padding from the face in the negative direction for each dimension (used -for block halo) - -padding from the face in the positive direction for each dimension (used -for block halo) - -input data of type `T` - -the name of type used for output diagnostics (e.g. "double", "float") - -a name used for output diagnostics -::: - -The `size` allows to declare different sized data arrays on a given -`block`. `d_m` and `d_p` are depth of the "block halos" that are used to -indicate the offset from the edge of a block (in both the negative and -positive directions of each dimension).\ -\ - -### {#section-4 .unnumbered} - -::: list -plus 1pt minus 1pt - -structured block - -dimension of dataset (number of items per grid element) - -the name of type used for output diagnostics (e.g. "double", "float") - -name of the dat used for output diagnostics - -hdf5 file to read and obtain the data from -::: - -### {#section-5 .unnumbered} - -::: list -plus 1pt minus 1pt - -a name used to identify the constant - -dimension of dataset (number of items per element) - -the name of type used for output diagnostics (e.g. 
"double", "float") - -pointer to input data of type `T` -::: - -### {#section-6 .unnumbered} - -::: list -plus 1pt minus 1pt - -a name used to identify the constant - -dimension of dataset (number of items per element) - -the name of type used for output diagnostics (e.g. "double", "float") - -pointer to new values for constant of type `T` -::: - -### {#section-7 .unnumbered} - -::: list -plus 1pt minus 1pt - -origin dataset - -destination dataset - -defines an iteration size (number of indices to iterate over in each -direction) - -indices of starting point in \"from\" dataset - -indices of starting point in \"to\" dataset - -direction of incrementing for \"from\" for each dimension of `iter_size` - -direction of incrementing for \"to\" for each dimension of `iter_size` -::: - -A from_dir \[1,2\] and a to_dir \[2,1\] means that x in the first block -goes to y in the second block, and y in first block goes to x in second -block. A negative sign indicates that the axis is flipped. (Simple -example: a transfer from (1:2,0:99,0:99) to (-1:0,0:99,0:99) would use -iter_size = \[2,100,100\], from_base = \[1,0,0\], to_base = \[-1,0,0\], -from_dir = \[0,1,2\], to_dir = \[0,1,2\]. In more complex case this -allows for transfers between blocks with different orientations.)\ - -### {#section-8 .unnumbered} - -::: list -plus 1pt minus 1pt - -origin dataset - -destination dataset - -hdf5 file to read and obtain the data from -::: - -### {#section-9 .unnumbered} - -::: list -plus 1pt minus 1pt - -number of halos in `halos` - -array of halos -::: - -### {#section-10 .unnumbered} - -::: list -plus 1pt minus 1pt - -size of data in bytes - -the name of type used for output diagnostics (e.g. 
"double", "float") - -name of the dat used for output diagnostics -::: - -::: list -plus 1pt minus 1pt - -the `ops_reduction` handle - -a pointer to write the results to, memory size has to match the declared -::: - -### {#section-11 .unnumbered} - -::: list -plus 1pt minus 1pt - -string describing the partitioning method. Currently this string is not -used internally, but is simply a place-holder to indicate different -partitioning methods in the future. -::: - -### {#section-12 .unnumbered} - -::: list -plus 1pt minus 1pt -::: - -## Diagnostics and output routines - -### {#section-13 .unnumbered} - -::: list -plus 1pt minus 1pt -::: - -### {#section-14 .unnumbered} - -::: list -plus 1pt minus 1pt -::: - -### {#section-15 .unnumbered} - -::: list -plus 1pt minus 1pt - -variable to hold the CPU time at the time of invocation - -variable to hold the elapsed time at the time of invocation -::: - -### {#section-16 .unnumbered} - -::: list -plus 1pt minus 1pt - -ops_block to be written - -hdf5 file to write to -::: - -### {#section-17 .unnumbered} - -::: list -plus 1pt minus 1pt - -ops_stencil to be written - -hdf5 file to write to -::: - -### {#section-18 .unnumbered} - -::: list -plus 1pt minus 1pt - -ops_dat to be written - -hdf5 file to write to -::: - -### {#section-19 .unnumbered} - -::: list -plus 1pt minus 1pt - -ops_dat to to be written - -text file to write to -::: - -### {#section-20 .unnumbered} - -::: list -plus 1pt minus 1pt - -output stream, use stdout to print to standard out -::: - -### {#section-21 .unnumbered} - -::: list -plus 1pt minus 1pt - -ops_dat to to be checked -::: - -## Halo exchange - -### {#section-22 .unnumbered} - -::: list -plus 1pt minus 1pt - -the halo group -::: - -## Parallel loop syntax - -A parallel loop with N arguments has the following syntax: - -### {#section-23 .unnumbered} - -::: list -plus 1pt minus 1pt - -user's kernel function with N arguments - -name of kernel function, used for output diagnostics - -the ops_block over 
which this loop executes - -dimension of loop iteration - -iteration range array - -arguments -::: - -The **ops_arg** arguments in **ops_par_loop** are provided by one of the -following routines, one for global constants and reductions, and the -other for OPS datasets. - -### {#section-24 .unnumbered} - -::: list -plus 1pt minus 1pt - -data array - -array dimension - -string representing the type of data held in data - -access type -::: - -### {#section-25 .unnumbered} - -::: list -plus 1pt minus 1pt - -an `ops_reduction` handle - -array dimension (according to `type`) - -string representing the type of data held in data - -access type -::: - -### {#section-26 .unnumbered} - -::: list -plus 1pt minus 1pt - -dataset - -stencil for accessing data - -string representing the type of data held in dataset - -access type -::: - -### {#section-27 .unnumbered} - -::: list -plus 1pt minus 1pt -::: - -## Stencils - -The final ingredient is the stencil specification, for which we have two -versions: simple and strided.\ - -### {#section-28 .unnumbered} - -::: list -plus 1pt minus 1pt - -dimension of loop iteration - -number of points in the stencil - -stencil for accessing data - -string representing the name of the stencil -::: - -### {#section-29 .unnumbered} - -::: list -plus 1pt minus 1pt - -dimension of loop iteration - -number of points in the stencil - -stencil for accessing data - -stride for accessing data - -string representing the name of the stencil\ -::: - -### {#section-30 .unnumbered} - -::: list -plus 1pt minus 1pt - -dimension of loop iteration - -number of points in the stencil - -string representing the name of the stencil - -hdf5 file to write to -::: - -In the strided case, the semantics for the index of data to be accessed, -for stencil point `p`, in dimension `m` are defined as:\ -,\ -where `loop_index[m]` is the iteration index (within the user-defined -iteration space) in the different dimensions. 
- -If, for one or more dimensions, both `stride[m]` and `stencil[p*dims+m]` -are zero, then one of the following must be true; - -- the dataset being referenced has size 1 for these dimensions - -- these dimensions are to be omitted and so the dataset has dimension - equal to the number of remaining dimensions. - -See `OPS/apps/c/CloverLeaf/build_field.cpp` and -`OPS/apps/c/CloverLeaf/generate.cpp` for an example -`ops_decl_strided_stencil` declaration and its use in a loop, -respectively.\ -These two stencil definitions probably take care of all of the cases in -the Introduction except for multiblock applications with interfaces with -different orientations -- this will need a third, even more general, -stencil specification. The strided stencil will handle both multigrid -(with a stride of 2 for example) and the boundary condition and reduced -dimension applications (with a stride of 0 for the relevant dimensions). - -## Checkpointing - -OPS supports the automatic checkpointing of applications. Using the API -below, the user specifies the file name for the checkpoint and an -average time interval between checkpoints, OPS will then automatically -save all necessary information periodically that is required to -fast-forward to the last checkpoint if a crash occurred. Currently, when -re-launching after a crash, the same number of MPI processes have to be -used. To enable checkpointing mode, the `OPS_CHECKPOINT` runtime -argument has to be used.\ - -### {#section-31 .unnumbered} - -::: list -plus 1pt minus 1pt - -name of the file for checkpointing. In MPI, this will automatically be -post-fixed with the rank ID. 
- -average time (seconds) between checkpoints - -a combinations of flags, listed in `ops_checkpointing.h`:\ -OPS_CHECKPOINT_INITPHASE - indicates that there are a number of parallel -loops at the very beginning of the simulations which should be excluded -from any checkpoint; mainly because they initialise datasets that do not -change during the main body of the execution. During restore mode these -loops are executed as usual. An example would be the computation of the -mesh geometry, which can be excluded from the checkpoint if it is -re-computed when recovering and restoring a checkpoint. The API call -void `ops_checkpointing_initphase_done()` indicates the end of this -initial phase. - -OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually -controls the location of the checkpoint, and explicitly specifies the -list of `ops_dat`s to be saved. - -OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the -location of the checkpoint, and it also enables fast-forwarding, by -skipping the execution of the application (even though none of the -parallel loops would actually execute, there may be significant work -outside of those) up to the checkpoint. - -OPS_CHECKPOINT_MANUAL - Indicates that when the corresponding API -function is called, the checkpoint should be created. Assumes the -presence of the above two options as well. 
-::: - -### {#section-32 .unnumbered} - -::: list -plus 1pt minus 1pt - -number of datasets to be saved - -arrays of `ops_dat` handles to be saved -::: - -### {#section-33 .unnumbered} - -::: list -plus 1pt minus 1pt - -size of the payload in bytes - -pointer to memory into which the payload is packed -::: - -### {#section-34 .unnumbered} - -::: list -plus 1pt minus 1pt - -number of datasets to be saved - -arrays of `ops_dat` handles to be saved - -size of the payload in bytes - -pointer to memory into which the payload is packed -::: - -### {#section-35 .unnumbered} - -::: list -plus 1pt minus 1pt - -number of datasets to be saved - -arrays of `ops_dat` handles to be saved - -size of the payload in bytes - -pointer to memory into which the payload is packed -::: - -The suggested use of these **manual** functions is of course when the -optimal location for checkpointing is known - one of the ways to -determine that is to use the built-in algorithm. More details of this -will be reported in a tech-report on checkpointing, to be published -later. - -## Access to OPS data - -This section describes APIS that give the user access to internal data -structures in OPS and return data to user-space. 
These should be used -cautiously and sparsely, as they can affect performance significantly - -### {#section-36 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset -::: - -### {#section-37 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset -::: - -### {#section-38 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -an array populated with the displacement of the chunk within the -"global" distributed array - -an array populated with the spatial extents -::: - -### {#section-39 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -an array populated with the displacement of the chunk within the -"global" distributed array - -an array populated with the spatial extents - -an array populated strides in spatial dimensions needed for column-major -indexing - -an array populated with padding on the left in each dimension. Note that -these are negative values - -an array populated with padding on the right in each dimension -::: - -### {#section-40 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -a stencil used to determine required MPI halo exchange depths - -when set to OPS_HOST or OPS_DEVICE, returns a pointer to data in that -memory space, otherwise must be set to 0, and returns whether data is in -the host or on the device -::: - -### {#section-41 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -the kind of access that was used by the user (OPS_READ if it was read -only, OPS_WRITE if it was overwritten, OPS_RW if it was read and -written) -::: - -### {#section-42 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -the kind of access that was used by the user (OPS_READ if it was read -only, OPS_WRITE if it was overwritten, OPS_RW if it was read and -written) - -set to OPS_HOST or OPS_DEVICE -::: - -### {#section-43 .unnumbered} - 
-::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -pointer to memory which should be filled by OPS -::: - -### {#section-44 .unnumbered} - -::: list -plus 1pt minus 1pt - -the dataset - -the chunk index (has to be 0) - -pointer to memory which should be copied to OPS -::: - -# Tiling for Cache-blocking - -OPS has a code generation (ops_gen_mpi_lazy) and build target for -tiling. Once compiled, to enable, use the `OPS_TILING` runtime parameter -- this will look at the L3 cache size of your CPU and guess the correct -tile size. If you want to alter the amount of cache to be used for the -guess, use the `OPS_CACHE_SIZE=XX` runtime parameter, where the value is -in Megabytes. To manually specify the tile sizes, use the -OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments. - -When MPI is combined with OpenMP tiling can be extended to the MPI -halos. Set `OPS_TILING_MAXDEPTH` to increase the the halo depths so that -halos for multiple `ops_par_loops` can be exchanged with a single MPI -message (see [@TPDS2017] for more details)\ -To test, compile CloverLeaf under `apps/c/CloverLeaf`, modify clover.in -to use a $6144^2$ mesh, then run as follows:\ -For OpenMP with tiling:\ -`export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING`\ -For MPI+OpenMP with tiling:\ -`export OMP_NUM_THREADS=xx; mpirun -np xx ./cloverleaf_mpi_tiled OPS_TILING OPS_TILING_MAXDEPTH=6`\ -To manually specify the tile sizes (in number of grid points), use the -OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments:\ -`export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING OPS_TILESIZE_X=600 OPS_TILESIZE_Y=200 ` - -# CUDA and OpenCL Runtime Arguments - -The CUDA (and OpenCL) thread block sizes can be controlled by setting -the `OPS_BLOCK_SIZE_X, OPS_BLOCK_SIZE_Y` and `OPS_BLOCK_SIZE_Z` runtime -arguments. 
For example :\ -`./cloverleaf_cuda OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4`\ -`OPS_CL_DEVICE=XX` runtime flag sets the OpenCL device to execute the -code on.\ -Usually `OPS_CL_DEVICE=0` selects the CPU and `OPS_CL_DEVICE=1` selects -GPUs. - -# Executing with GPUDirect - -GPU direct support for MPI+CUDA, to enable (on the OPS side) add -**-gpudirect** when running the executable. You may also have to use -certain environmental flags when using different MPI distributions. For -an example of the required flags and environmental settings on the -Cambridge Wilkes2 GPU cluster see:\ - - -# OPS User Kernels - -In OPS, the elemental operation carried out per mesh/grid point is -specified as an outlined function called a *user kernel*. An example -taken from the Cloverleaf application is given in Figure -[\[fig:example\]](#fig:example){reference-type="ref" -reference="fig:example"}.\ - -``` {.cpp mathescape="" linenos="" startFrom="1" numbersep="0pt" gobble="2" frame="lines" framesep="1mm"} -void accelerate_kernel( const ACC &density0, const ACC &volume, - ACC &stepbymass, const ACC &xvel0, ACC &xvel1, - const ACC &xarea, const ACC &pressure, - const ACC &yvel0, ACC &yvel1, - const ACC &yarea, const ACC &viscosity) { - - double nodal_mass; - - //{0,0, -1,0, 0,-1, -1,-1}; - nodal_mass = ( density0(-1,-1) * volume(-1,-1) - + density0(0,-1) * volume(0,-1) - + density0(0,0) * volume(0,0) - + density0(-1,0) * volume(-1,0) ) * 0.25; - - stepbymass(0,0) = 0.5*dt/ nodal_mass; - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, 0,-1}; - - xvel1(0,0) = xvel0(0,0) - stepbymass(0,0) * - ( xarea(0,0) * ( pressure(0,0) - pressure(-1,0) ) + - xarea(0,-1) * ( pressure(0,-1) - pressure(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, -1,0}; - - yvel1(0,0) = yvel0(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( pressure(0,0) - pressure(0,-1) ) + - yarea(-1,0) * ( pressure(-1,0) - pressure(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, 0,-1}; - - xvel1(0,0) = xvel1(0,0) - stepbymass(0,0) * - ( 
xarea(0,0) * ( viscosity(0,0) - viscosity(-1,0) ) + - xarea(0,-1) * ( viscosity(0,-1) - viscosity(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, -1,0}; - - yvel1(0,0) = yvel1(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( viscosity(0,0) - viscosity(0,-1) ) + - yarea(-1,0) * ( viscosity(-1,0) - viscosity(-1,-1) ) ); - - -} -``` - -[\[fig:example\]]{#fig:example label="fig:example"} - -\ -\ -\ -\ -This user kernel is then used in an `ops_par_loop` (Figure -[\[fig:parloop\]](#fig:parloop){reference-type="ref" -reference="fig:parloop"}). The key aspect to note in the user kernel in -Figure [\[fig:example\]](#fig:example){reference-type="ref" -reference="fig:example"} is the use of the ACC\<\> objects and their -parentheses operator. These specify the stencil in accessing the -elements of the respective data arrays. - -``` {.cpp mathescape="" linenos="" startFrom="1" numbersep="0pt" gobble="2" frame="lines" framesep="2mm"} - int rangexy_inner_plus1[] = {x_min,x_max+1,y_min,y_max+1}; - - ops_par_loop(accelerate_kernel, "accelerate_kernel", clover_grid, 2, rangexy_inner_plus1, - ops_arg_dat(density0, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(volume, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(xvel0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(xarea, 1, S2D_00_0M1, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(yarea, 1, S2D_00_M10, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ)); -``` - -[\[fig:parloop\]]{#fig:parloop label="fig:parloop"} - -::: thebibliography -1 OP2 for Many-Core Platforms, 2013. - - -Istvan Z. Reguly, G.R. Mudalige, Mike B. Giles. Loop Tiling in -Large-Scale Stencil Codes at Run-time with OPS. 
(2017) IEEE Transactions -on Parallel and Distributed Systems. - -::: From e827b814ed37eed8926c8a7e886449e5fb5faaba Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sat, 9 Oct 2021 23:55:19 +0100 Subject: [PATCH 254/324] Gitflow work flow model --- doc/devdoc.md | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index 5e906c5729..e959787cdd 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -1,11 +1,44 @@ -# Developer Guide +# Developer Guide ## Code-generator ### Frontend API parser ### Target Parallel Templates ### Elemental Kernel Transformations ## Back-end Library ### Sequential and multi-threaded CPU -### MPI and Partitioning -### HDF5 +### MPI and Partitioning +### HDF5 ### CUDA ### Cache blocking tiling and comm-avoiding optimizations +## Git work flow for contribution +To facilitate the concept of "Version" and "Release", we adopt the [Gitflow Workflow model](#https://nvie.com/posts/a-successful-git-branching-model/). +### Overall work flow + +1. Create develop branch from main + +2. Create release branch from develop + + After creating a release branch, only documentation and bug fixes will be added this branch. + +3. Create feature branches from develop + +4. Merge a feature branch into the develop branch once completed + +5. Merge release branch into develop and main once completed + +6. Create a hotfix branch from main if an issue is identified + +7. Merge a hotfix branch to both develop and main once fixed + +See also https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow. + +### A few issues +Using the Gitflow model tends to produce a few long-live branches (e.g., feature), which may increase the risk of "conflicts" for intergration. 
To migrate this, we encourage the following practice + +* Try to create short-lived branches with a few small commites when possbile (e.g., a hotfix branch) +* Once a branch properly merges or a feature finalised, delete the branch +* A feature branch tends to be long-live, try to split a feature into "milestones" and merge into the develop branch when finishing each milestone. + +**The Gitflow tool will automatically delete a branch once it is finished.** +### Gitflow tool + +see https://github.com/nvie/gitflow \ No newline at end of file From 34d79ed959cd62be0919c42eb862ab9a50401ea8 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 22:17:47 +0100 Subject: [PATCH 255/324] Try to build doxygen --- doc/conf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index c53bf5a553..3815b16821 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -13,7 +13,9 @@ # import os # import sys # sys.path.insert(0, os.path.abspath('.')) - +import subprocess +subprocess.call('doxygen doc/ops/Doxyfile', shell=True) +html_extra_path = ['doc/ops/html'] # -- Project information ----------------------------------------------------- From 50d0d8c2e21388901312fe62186bcac8ecc1dc02 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 22:24:18 +0100 Subject: [PATCH 256/324] Tune Doxygen Dir --- doc/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 3815b16821..a4206b8354 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -14,8 +14,8 @@ # import sys # sys.path.insert(0, os.path.abspath('.')) import subprocess -subprocess.call('doxygen doc/ops/Doxyfile', shell=True) -html_extra_path = ['doc/ops/html'] +subprocess.call('doxygen ops/Doxyfile', shell=True) +html_extra_path = ['ops/html'] # -- Project information ----------------------------------------------------- From f20750a0c1ba8ffa850bfd9cad2ea34a8a33315e Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 
2021 22:47:43 +0100 Subject: [PATCH 257/324] Try Copy Doxygen --- doc/conf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index a4206b8354..97b7b120d5 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -15,7 +15,8 @@ # sys.path.insert(0, os.path.abspath('.')) import subprocess subprocess.call('doxygen ops/Doxyfile', shell=True) -html_extra_path = ['ops/html'] +subprocess.call('cp ops/html/ . -r', shell=True) +#html_extra_path = ['ops/html'] # -- Project information ----------------------------------------------------- From ddd40b4bdccb12aba76cb7d0d93c47de6707443a Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 22:55:43 +0100 Subject: [PATCH 258/324] Not copy --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 97b7b120d5..12886a4891 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -15,7 +15,7 @@ # sys.path.insert(0, os.path.abspath('.')) import subprocess subprocess.call('doxygen ops/Doxyfile', shell=True) -subprocess.call('cp ops/html/ . -r', shell=True) +#subprocess.call('cp ops/html/ . -r', shell=True) #html_extra_path = ['ops/html'] # -- Project information ----------------------------------------------------- From 7c78d8655b8107fa85380fc2d498d6a9fa9817fa Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 23:22:58 +0100 Subject: [PATCH 259/324] Stoo sphinx --- doc/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 12886a4891..ac5f42dbae 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -14,7 +14,7 @@ # import sys # sys.path.insert(0, os.path.abspath('.')) import subprocess -subprocess.call('doxygen ops/Doxyfile', shell=True) +#subprocess.call('doxygen ops/Doxyfile', shell=True) #subprocess.call('cp ops/html/ . 
-r', shell=True) #html_extra_path = ['ops/html'] From 9b3a14cae9a3bf2c91a2e06df210f66523246a89 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 23:33:40 +0100 Subject: [PATCH 260/324] Try Gitlab for doxygen --- .gitlab-ci.yml | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ecb22b0399..264bfe1e29 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -255,19 +255,32 @@ test:Intel: only: - master -#Stage "docs" -make-docs: - stage: docs - when: always +# #Stage "docs" +# make-docs: +# stage: docs +# when: always +# tags: +# - CCP, test +# script: +# - cd doc && make all clean +# artifacts: +# expire_in: 4 week +# paths: +# - doc/user.pdf +# - doc/ops/html +# - doc/ops/latex/refman.pdf +# - doc/ops_translator/html +# - doc/ops_translator/latex/refman.pdf + +pages: tags: - CCP, test script: - - cd doc && make all clean + - cd doc + - doxygen ops/Doxyfile + - mv ops/html/ public/ artifacts: - expire_in: 4 week paths: - - doc/user.pdf - - doc/ops/html - - doc/ops/latex/refman.pdf - - doc/ops_translator/html - - doc/ops_translator/latex/refman.pdf \ No newline at end of file + - public + rules: + - if: $CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH From 7bfb76f17aac954b503819ef549aae5acaa58bbb Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 23:49:36 +0100 Subject: [PATCH 261/324] Add stage --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 264bfe1e29..944b713e1a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -273,6 +273,7 @@ test:Intel: # - doc/ops_translator/latex/refman.pdf pages: + stage: docs tags: - CCP, test script: From cd2683f31871a3d857cf8e6f6116d6f7a80f23aa Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Sun, 10 Oct 2021 23:53:47 +0100 Subject: [PATCH 262/324] Adjust when to generate doxygen --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/.gitlab-ci.yml b/.gitlab-ci.yml index 944b713e1a..bbae0880a6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -274,6 +274,7 @@ test:Intel: pages: stage: docs + when: always tags: - CCP, test script: @@ -283,5 +284,4 @@ pages: artifacts: paths: - public - rules: - - if: $CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH + From 6570d96e72e1085a772c847c4bb6d9e576018b3d Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Mon, 11 Oct 2021 00:18:04 +0100 Subject: [PATCH 263/324] Correct public dir --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bbae0880a6..d6c11f926a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -283,5 +283,5 @@ pages: - mv ops/html/ public/ artifacts: paths: - - public + - doc/public From 0730cef93c959b4a9af7c870dc8f3da87983b6f8 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Mon, 11 Oct 2021 09:12:39 +0100 Subject: [PATCH 264/324] Correct Dir --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d6c11f926a..239f41b92d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -280,8 +280,8 @@ pages: script: - cd doc - doxygen ops/Doxyfile - - mv ops/html/ public/ + - mv ops/html/ $CI_PROJECT_DIR/public/ artifacts: paths: - - doc/public + - public From 6e0662522c9d47740e03d51ee7de018b973579a7 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Mon, 11 Oct 2021 09:46:15 +0100 Subject: [PATCH 265/324] Add doxygen comment link --- doc/opsapi.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/opsapi.md b/doc/opsapi.md index 3b81698c27..1b40c04f7a 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -682,3 +682,6 @@ This routine copies the data given by the user to the internal data structure u |dat| the dataset| |part| the chunk index (has to be 0)| |data| pointer to memory which should be copied to OPS | + +### Doxygen comments for APIs +We also provide Doxygen comments in for using APIs, please view 
[here](#https://op-dsl-ci.gitlab.io/ops-ci/). \ No newline at end of file From de4f14a71eba635f13c96c10bc4790d64bd46d27 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Mon, 11 Oct 2021 09:48:53 +0100 Subject: [PATCH 266/324] Repair the Doxgen link --- doc/opsapi.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 1b40c04f7a..71c4e11058 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -684,4 +684,4 @@ This routine copies the data given by the user to the internal data structure u |data| pointer to memory which should be copied to OPS | ### Doxygen comments for APIs -We also provide Doxygen comments in for using APIs, please view [here](#https://op-dsl-ci.gitlab.io/ops-ci/). \ No newline at end of file +We also provide Doxygen comments in for using APIs, please view [here](https://op-dsl-ci.gitlab.io/ops-ci/). \ No newline at end of file From 8e01b4d9b809123d9e73a7cdef194ad2a09a00ca Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 11 Oct 2021 12:02:11 +0100 Subject: [PATCH 267/324] Update opsapi.md --- doc/opsapi.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 71c4e11058..b8c9a908e7 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -683,5 +683,5 @@ This routine copies the data given by the user to the internal data structure u |part| the chunk index (has to be 0)| |data| pointer to memory which should be copied to OPS | -### Doxygen comments for APIs -We also provide Doxygen comments in for using APIs, please view [here](https://op-dsl-ci.gitlab.io/ops-ci/). \ No newline at end of file +## Doxygen +Doxygen generated from OPS source can be found [here](https://op-dsl-ci.gitlab.io/ops-ci/). From dab182d9f5cda3919ab921cd69d0b1d812c390ba Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Mon, 11 Oct 2021 12:14:28 +0100 Subject: [PATCH 268/324] Update installation.md --- doc/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index ec6acbc0c8..ae1f68b90e 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -144,4 +144,4 @@ make ``` -## Runtime Flags and Options + From 7f54eb416c630442921d8ed35a4c09689c4ddaa2 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 11 Oct 2021 12:15:11 +0100 Subject: [PATCH 269/324] Update devanapp.md --- doc/devanapp.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 26a8f14339..cc4e304057 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -76,6 +76,7 @@ ops_par_loop(accelerate_kernel, "accelerate_kernel", clover_grid, 2, rangexy_inn ops_arg_dat(yarea, 1, S2D_00_M10, "double", OPS_READ), ops_arg_dat(viscosity, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ)); ``` +## File I/O ## Supported Paralleizations ## Code-generation Flags -## File I/O +## Runtime Flags and Options From 8d951ad5b3b8c3231b4de13fe42ef005cf89a6f9 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 11 Oct 2021 12:17:20 +0100 Subject: [PATCH 270/324] Update devanapp.md --- doc/devanapp.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index cc4e304057..d9adc96451 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,7 +1,7 @@ # Developing an OPS Application ## Tutorial -## + ## OPS User Kernels In OPS, the elemental operation carried out per mesh/grid point is From d5c16d466316ca67ab5c30974cf16016d6792754 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 12:10:17 +0100 Subject: [PATCH 271/324] Update devanapp.md --- doc/devanapp.md | 91 +++++++++---------------------------------------- 1 file changed, 16 insertions(+), 75 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index d9adc96451..eb674cea26 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,82 +1,23 @@ # Developing an OPS Application -## Tutorial -## OPS User Kernels +## OPS Abstraction +## Example Application +## Original - Initialisation +## Original - Boundary loops +## Original - Main iteration +## Build OPS +## Step 1 - Preparing to use OPS +## Step 2 - OPS declarations +## Step 3 - First parallel loop +## Step 4 - Indexes and global constants +## Step 5 - Complex stencils and reductions +## Step 6 - Handing it all to OPS +## Step 7 - Code generation +## Code generated versions +## Optimizations - general +## Optimizations - tiling -In OPS, the elemental operation carried out per mesh/grid point is -specified as an outlined function called a *user kernel*. An example -taken from the Cloverleaf application is given below. 
- -```c++ -void accelerate_kernel( const ACC &density0, const ACC &volume, - ACC &stepbymass, const ACC &xvel0, ACC &xvel1, - const ACC &xarea, const ACC &pressure, - const ACC &yvel0, ACC &yvel1, - const ACC &yarea, const ACC &viscosity) { - - double nodal_mass; - - //{0,0, -1,0, 0,-1, -1,-1}; - nodal_mass = ( density0(-1,-1) * volume(-1,-1) - + density0(0,-1) * volume(0,-1) - + density0(0,0) * volume(0,0) - + density0(-1,0) * volume(-1,0) ) * 0.25; - - stepbymass(0,0) = 0.5*dt/ nodal_mass; - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, 0,-1}; - - xvel1(0,0) = xvel0(0,0) - stepbymass(0,0) * - ( xarea(0,0) * ( pressure(0,0) - pressure(-1,0) ) + - xarea(0,-1) * ( pressure(0,-1) - pressure(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, -1,0}; - - yvel1(0,0) = yvel0(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( pressure(0,0) - pressure(0,-1) ) + - yarea(-1,0) * ( pressure(-1,0) - pressure(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, 0,-1}; - - xvel1(0,0) = xvel1(0,0) - stepbymass(0,0) * - ( xarea(0,0) * ( viscosity(0,0) - viscosity(-1,0) ) + - xarea(0,-1) * ( viscosity(0,-1) - viscosity(-1,-1) ) ); - - //{0,0, -1,0, 0,-1, -1,-1}; - //{0,0, -1,0}; - - yvel1(0,0) = yvel1(0,0) - stepbymass(0,0) * - ( yarea(0,0) * ( viscosity(0,0) - viscosity(0,-1) ) + - yarea(-1,0) * ( viscosity(-1,0) - viscosity(-1,-1) ) ); - -} -``` - -This user kernel is then used in an `ops_par_loop` function. The key aspect to note in the user kernel is the use of the ACC\<\> objects and their -parentheses operator. These specify the stencil in accessing the -elements of the respective data arrays. 
- -```c++ -int rangexy_inner_plus1[] = {x_min,x_max+1,y_min,y_max+1}; - -ops_par_loop(accelerate_kernel, "accelerate_kernel", clover_grid, 2, rangexy_inner_plus1, - ops_arg_dat(density0, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(volume, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(work_array1, 1, S2D_00, "double", OPS_WRITE), - ops_arg_dat(xvel0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(xvel1, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(xarea, 1, S2D_00_0M1, "double", OPS_READ), - ops_arg_dat(pressure, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ), - ops_arg_dat(yvel0, 1, S2D_00, "double", OPS_READ), - ops_arg_dat(yvel1, 1, S2D_00, "double", OPS_INC), - ops_arg_dat(yarea, 1, S2D_00_M10, "double", OPS_READ), - ops_arg_dat(viscosity, 1, S2D_00_M10_0M1_M1M1, "double", OPS_READ)); -``` -## File I/O ## Supported Paralleizations ## Code-generation Flags ## Runtime Flags and Options From 35d3ee950d0a7391091fc3b4ce515fe346da1b90 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 12:22:03 +0100 Subject: [PATCH 272/324] Update devanapp.md --- doc/devanapp.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index eb674cea26..b6592298f6 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -1,7 +1,14 @@ # Developing an OPS Application - +This page provides a tutorial in the basics of using OPS for multi-block structured mesh application development. This is taken from a [presentation](https://op-dsl.github.io/docs/OPS/tutorial.pdf) given initially in April 2018 and subsequently updated for the latest release of OPS. ## OPS Abstraction +OPS is a Domain Specific Language embedded in C/C++ and Fortran, targeting the development of multi-block structured mesh computations. The abstraction has two distinct components: the definition of the mesh, and operations over the mesh. 
+* Defining a number of 1-3D blocks, and on them a number of datasets, which have specific extents in the different dimensions. +* Describing a parallel loop over a given block, with a given iteration range, executing a given "kernel function" at each grid point, and describing what datasets are going to be accessed and how. +* Additionally, one needs to declare stencils (access patterns) that will be used in parallel loops to access datasets, and any global constants (read-only global scope variables) + +Data and computations expressed this way can be automatically managed and parallelised by the OPS library. Higher dimensions supported in the backend, but not currently by the code generators. + ## Example Application ## Original - Initialisation ## Original - Boundary loops From 58f7f273d2f9e2086bfc405098eb96cc2748828d Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 12:26:10 +0100 Subject: [PATCH 273/324] Update devanapp.md --- doc/devanapp.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index b6592298f6..7ba242e157 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -7,9 +7,16 @@ OPS is a Domain Specific Language embedded in C/C++ and Fortran, targeting the d * Describing a parallel loop over a given block, with a given iteration range, executing a given "kernel function" at each grid point, and describing what datasets are going to be accessed and how. * Additionally, one needs to declare stencils (access patterns) that will be used in parallel loops to access datasets, and any global constants (read-only global scope variables) -Data and computations expressed this way can be automatically managed and parallelised by the OPS library. Higher dimensions supported in the backend, but not currently by the code generators. +Data and computations expressed this way can be automatically managed and parallelised by the OPS library. 
Higher dimensions are supported in the backend, but not currently by the code generators. ## Example Application +In this tutorial we will use an example application, a simple 2D iterative Laplace equation solver. +* Go to the `OPS/apps/c/laplace2dtutorial/original` directory +* Open the `laplace2d.cpp` file +* It uses an $imax x jmax$ grid, with an additional 1 layers of boundary cells on all sides +* There are a number of loops that set the boundary conditions along the four edges +* The bulk of the simulation is spent in a whilel oop, repeating a stencil kernel with a maximum reduction, and a copy kernel +* Compile and run the code ! ## Original - Initialisation ## Original - Boundary loops ## Original - Main iteration From a2025bea98db386c453d52e44727633c65961ce9 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 12:32:20 +0100 Subject: [PATCH 274/324] Update devanapp.md --- doc/devanapp.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 7ba242e157..591d4dbccc 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -17,7 +17,30 @@ In this tutorial we will use an example application, a simple 2D iterative Lapla * There are a number of loops that set the boundary conditions along the four edges * The bulk of the simulation is spent in a whilel oop, repeating a stencil kernel with a maximum reduction, and a copy kernel * Compile and run the code ! + ## Original - Initialisation +The original code begins with initializing the data arrays used in the calculation. 
+``` +// Size along y +int jmax = 4094; +// Size along x +int imax = 4094; + +int itermax = 100; +double pi = 2.0∗asin(1.0); +const double tol = 1.0e−6; +double error = 1.0; + +double ∗A; +double ∗Anew; +double ∗y0; + +A = (double ∗)malloc ((imax+2)∗(jmax+2)∗sizeof(double)); +Anew = (double ∗)malloc ((imax+2)∗(jmax+2)∗sizeof(double)); +y0 = (double ∗)malloc ((imax+2)∗sizeof(double)); + +memset(A, 0, (imax+2)∗(jmax+2)∗sizeof(double)); +``` ## Original - Boundary loops ## Original - Main iteration ## Build OPS From 2b7b397d749bcaa1ee7fcbafe579e5ad86a3e90f Mon Sep 17 00:00:00 2001 From: Istvan Reguly Date: Tue, 12 Oct 2021 13:45:12 +0200 Subject: [PATCH 275/324] Update SYCL/HIP --- doc/installation.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index ae1f68b90e..97031296b2 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -32,10 +32,12 @@ The [CUDA](https://developer.nvidia.com/cuda-downloads) backend targets NVIDIA G **HIP** -The HIP backend targets AMD GPUs which are supported by the ROCm stack +The HIP backend targets AMD GPUs and NVIDIA GPUs which are supported by HIP - either through its CUDA support or the ROCm stack (tested with >=3.9). **SYCL** +The SYCL backend is currently in development and only working without MPI. It has been tested with Intel OneAPI (>=2021.1), Intel's public LLVM version, and hipSYCL (>=0.9.1), and runs on Intel CPUs and GPUs through Intel's OpenCL and Level Zero, NVIDIA and AMD GPUs both with the LLVM fork as well as hipSYCL. hipSYCL's OpenMP support covers most CPU architectures too. + **Tridiagonal Solver** To use the tridiagonal solver OPS API in applications and build example applications such as `adi`, `adi_burger` and `adi_burger_3D` the open source tridiagonal solver (scalar) library needs to be cloned and built from the [Tridsolver repository](https://github.com/OP-DSL/tridsolver). 
From 03568b711fc93758b1c47cde0ea627aa2121cd5a Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 12:57:03 +0100 Subject: [PATCH 276/324] Update devanapp.md --- doc/devanapp.md | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 591d4dbccc..ef674b9df8 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -13,13 +13,13 @@ Data and computations expressed this way can be automatically managed and parall In this tutorial we will use an example application, a simple 2D iterative Laplace equation solver. * Go to the `OPS/apps/c/laplace2dtutorial/original` directory * Open the `laplace2d.cpp` file -* It uses an $imax x jmax$ grid, with an additional 1 layers of boundary cells on all sides +* It uses an $imax$x$jmax$ grid, with an additional 1 layers of boundary cells on all sides * There are a number of loops that set the boundary conditions along the four edges * The bulk of the simulation is spent in a whilel oop, repeating a stencil kernel with a maximum reduction, and a copy kernel * Compile and run the code ! ## Original - Initialisation -The original code begins with initializing the data arrays used in the calculation. +The original code begins with initializing the data arrays used in the calculation: ``` // Size along y int jmax = 4094; @@ -42,7 +42,46 @@ y0 = (double ∗)malloc ((imax+2)∗sizeof(double)); memset(A, 0, (imax+2)∗(jmax+2)∗sizeof(double)); ``` ## Original - Boundary loops +The application sen sets boundary conditions: +``` +for (int i = 0; i < imax+2; i++) + A[(0)*(imax+2)+i] = 0.0; + +for (int i = 0; i < imax+2; i++) + A[(jmax+1)*(imax+2)+i] = 0.0; + +for (int j = 0; j < jmax+2; j++) { + A[(j)*(imax+2)+0] = sin(pi * j / (jmax+1)); +} + +for (int j = 0; j < imax+2; j++) { + A[(j)*(imax+2)+imax+1] = sin(pi * j / (jmax+1))*exp(-pi); +} +``` +Note how in the latter two loops the loop index is used. 
+ ## Original - Main iteration +The main iterative loop is a while loop iterating until the error tolarance is at a set level and the number of iterations are les than the maximum set. +``` +while ( error > tol && iter < iter_max ) { + error = 0.0; + for( int j = 1; j < jmax+1; j++ ) { + for( int i = 1; i < imax+1; i++) { + Anew[(j)*(imax+2)+i] = 0.25f * + ( A[(j)*(imax+2)+i+1] + A[(j)*(imax+2)+i-1] + + A[(j-1)*(imax+2)+i] + A[(j+1)*(imax+2)+i]); + error = fmax( error, fabs(Anew[(j)*(imax+2)+i]-A[(j)*(imax+2)+i])); + } + } + for( int j = 1; j < jmax+1; j++ ) { + for( int i = 1; i < imax+1; i++) { + A[(j)*(imax+2)+i] = Anew[(j)*(imax+2)+i]; + } + } + if(iter % 10 == 0) printf("%5d, %0.6f\n", iter, error); + iter++; + } + ``` ## Build OPS ## Step 1 - Preparing to use OPS ## Step 2 - OPS declarations From aaf61dea89eeb649af5725fc8453b9d3ca3b9e34 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 13:17:34 +0100 Subject: [PATCH 277/324] Update devanapp.md --- doc/devanapp.md | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index ef674b9df8..468c5c915d 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -21,25 +21,26 @@ In this tutorial we will use an example application, a simple 2D iterative Lapla ## Original - Initialisation The original code begins with initializing the data arrays used in the calculation: ``` -// Size along y +//Size along y int jmax = 4094; -// Size along x +//Size along x int imax = 4094; +//Size along x +int iter_max = 100; -int itermax = 100; -double pi = 2.0∗asin(1.0); -const double tol = 1.0e−6; -double error = 1.0; +double pi = 2.0 * asin(1.0); +const double tol = 1.0e-6; +double error = 1.0; -double ∗A; -double ∗Anew; -double ∗y0; +double *A; +double *Anew; +double *y0; -A = (double ∗)malloc ((imax+2)∗(jmax+2)∗sizeof(double)); -Anew = (double ∗)malloc ((imax+2)∗(jmax+2)∗sizeof(double)); -y0 = (double ∗)malloc ((imax+2)∗sizeof(double)); 
+A = (double *)malloc((imax+2) * (jmax+2) * sizeof(double)); +Anew = (double *)malloc((imax+2) * (jmax+2) * sizeof(double)); +y0 = (double *)malloc((imax+2) * sizeof(double)); -memset(A, 0, (imax+2)∗(jmax+2)∗sizeof(double)); +memset(A, 0, (imax+2) * (jmax+2) * sizeof(double)); ``` ## Original - Boundary loops The application sen sets boundary conditions: From 9aea9bbb0382a36e58ad9d55c920b9c7c0345b89 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 13:20:33 +0100 Subject: [PATCH 278/324] Update devanapp.md --- doc/devanapp.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 468c5c915d..0cb450eb45 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -72,18 +72,20 @@ while ( error > tol && iter < iter_max ) { ( A[(j)*(imax+2)+i+1] + A[(j)*(imax+2)+i-1] + A[(j-1)*(imax+2)+i] + A[(j+1)*(imax+2)+i]); error = fmax( error, fabs(Anew[(j)*(imax+2)+i]-A[(j)*(imax+2)+i])); - } } - for( int j = 1; j < jmax+1; j++ ) { - for( int i = 1; i < imax+1; i++) { - A[(j)*(imax+2)+i] = Anew[(j)*(imax+2)+i]; - } + } + for( int j = 1; j < jmax+1; j++ ) { + for( int i = 1; i < imax+1; i++) { + A[(j)*(imax+2)+i] = Anew[(j)*(imax+2)+i]; } - if(iter % 10 == 0) printf("%5d, %0.6f\n", iter, error); - iter++; } - ``` + if(iter % 10 == 0) printf("%5d, %0.6f\n", iter, error); + iter++; +} +``` ## Build OPS +Build OPS using instructions in the [Getting Started](https://ops-dsl.readthedocs.io/en/markdowndocdev/installation.html#getting-started) page. + ## Step 1 - Preparing to use OPS ## Step 2 - OPS declarations ## Step 3 - First parallel loop From cf2b11b311eea20e1f9bf6aab4b9a8ccf8e4409a Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 13:28:57 +0100 Subject: [PATCH 279/324] Update devanapp.md --- doc/devanapp.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 0cb450eb45..9298fd44d7 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -13,7 +13,7 @@ Data and computations expressed this way can be automatically managed and parall In this tutorial we will use an example application, a simple 2D iterative Laplace equation solver. * Go to the `OPS/apps/c/laplace2dtutorial/original` directory * Open the `laplace2d.cpp` file -* It uses an $imax$x$jmax$ grid, with an additional 1 layers of boundary cells on all sides +* It uses an $imax$ x $jmax$ grid, with an additional 1 layers of boundary cells on all sides * There are a number of loops that set the boundary conditions along the four edges * The bulk of the simulation is spent in a whilel oop, repeating a stencil kernel with a maximum reduction, and a copy kernel * Compile and run the code ! @@ -87,6 +87,24 @@ while ( error > tol && iter < iter_max ) { Build OPS using instructions in the [Getting Started](https://ops-dsl.readthedocs.io/en/markdowndocdev/installation.html#getting-started) page. ## Step 1 - Preparing to use OPS +Firstly, include the appropriate header files, then initialise OPS, and at the end finalise it. +* Define that this application is 2D, include the OPS header file, and create a header file where the outlined "elemental kernels" will live. +``` +#define OPS_2D +#include +#include "laplace_kernels.h" +``` +* Initialise and finalise OPS +``` +int main(int argc, const char** argv) { + //Initialise the OPS library, passing runtime args, and setting diagnostics level to low (1) + ops_init(argc, argv,1); + ... + ... 
+ //Finalising the OPS library + ops_exit(); +} +``` ## Step 2 - OPS declarations ## Step 3 - First parallel loop ## Step 4 - Indexes and global constants From 9fd55c9f56169c06a6f84f0f6969a722f79173a9 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 13:31:15 +0100 Subject: [PATCH 280/324] Update devanapp.md --- doc/devanapp.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 9298fd44d7..31a785f549 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -105,6 +105,8 @@ int main(int argc, const char** argv) { ops_exit(); } ``` +By this point you need OPS set up - take a look at the Makefile in step1, and observ that the include and library paths are added, and we link against `ops_seq`. + ## Step 2 - OPS declarations ## Step 3 - First parallel loop ## Step 4 - Indexes and global constants From 3102b59ddd2c42b17780e49b7d4565649d760061 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 14:14:24 +0100 Subject: [PATCH 281/324] Update devanapp.md --- doc/devanapp.md | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 31a785f549..4a35bce518 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -105,9 +105,38 @@ int main(int argc, const char** argv) { ops_exit(); } ``` -By this point you need OPS set up - take a look at the Makefile in step1, and observ that the include and library paths are added, and we link against `ops_seq`. +By this point you need OPS set up - take a look at the Makefile in step1, and observe that the include and library paths are added, and we link against `ops_seq`. 
## Step 2 - OPS declarations +Now declare a block and data on the block : +``` +//The 2D block +ops_block block = ops_decl_block(2, "my_grid"); + +//The two datasets +int size[] = {imax, jmax}; +int base[] = {0,0}; +int d_m[] = {-1,-1}; +int d_p[] = {1,1}; +ops_dat d_A = ops_decl_dat(block, 1, size, base, + d_m, d_p, A, "double", "A"); +ops_dat d_Anew = ops_decl_dat(block, 1, size, base, + d_m, d_p, Anew, "double", "Anew"); +``` +Data sets have a size (number of mesh points in each dimension). There is passing for halos or boundaries in the positive (`d_p`) and negative directions (`d_m`). Here we use a 1 thick boundary layer. Base index can be defined as it may be different from 0 (e.g. in Fortran). Item these with a 0 base index and a 1 wide halo, these datasets can be indexed from −1 tosize +1. + +OPS supports gradual conversion of applications to its API, but in this case the described data sizes will need to match: the allocated memory and its extents need to be correctly described to OPS. In this example we have two `(imax+ 2) ∗ (jmax+ 2)` size arrays, and the total size in each dimension needs to matchsize `[i] + d_p[i] − d_m[i]`. This is only supported for the sequential and OpenMP backends. If a `NULL` pointer is passed, OPS will allocate the data internally. + +We also need to declare the stencils that will be used - in this example most loops use a simple 1-point stencil, and one uses a 5-point stencil: +``` +//Two stencils, a 1-point, and a 5-point +int s2d_00[] = {0,0}; +ops_stencil S2D_00 = ops_decl_stencil(2,1,s2d_00,"0,0"); +int s2d_5pt[] = {0,0, 1,0, -1,0, 0,1, 0,-1}; +ops_stencil S2D_5pt = ops_decl_stencil(2,5,s2d_5pt,"5pt"); +``` +Different names may be used for stencils in your code, but we suggest using some convention. + ## Step 3 - First parallel loop ## Step 4 - Indexes and global constants ## Step 5 - Complex stencils and reductions From d5dfa21198b836b28306f7193e8b3e7d98d81ff3 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 14:25:17 +0100 Subject: [PATCH 282/324] Update devanapp.md --- doc/devanapp.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 4a35bce518..d62385c434 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -138,6 +138,27 @@ ops_stencil S2D_5pt = ops_decl_stencil(2,5,s2d_5pt,"5pt"); Different names may be used for stencils in your code, but we suggest using some convention. ## Step 3 - First parallel loop +You can now convert the first loop to use OPS: +``` +for (int i = 0; i < imax+2; i++) + A[(0)*(imax+2)+i] = 0.0; +``` +This is a loop on the ottom boundary of the domain, which is at the −1 index for our dataset, therefore our iteration range will be over the entire domain, including halos in the X direction, and the bottom boundary in the Y direction. The iteration range is given as beginning (inclusive) and end (exclusive) indices in the x, y, etc. directions. +``` +int bottom_range[] = {-1, imax+1, -1, 0}; +``` +Next, we need to outline the “elemental” into `laplacekernels.h`, and place the appropriate access objects - `ACC &A`, in the kernel’s formal parameter list, and `(i,j)` are the stencil offsets in the X and Y directions respectively: +``` +void set_zero(ACC &A) { + A(0,0) = 0.0; +} +``` +The OPS parallel loop can now be written as follows: +``` +ops_par_loop(set_zero, "set_zero", block, 2, bottom_range, + ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE)); +``` +The loop will execute `set_zero` at each mesh point defined in the iteration range, and write the dataset `d_A` with the 1-point stencil. The `ops_par_loop` implies that the order in which grid points will be executed will not affect the end result (within machine precision). ## Step 4 - Indexes and global constants ## Step 5 - Complex stencils and reductions ## Step 6 - Handing it all to OPS From ac554f8f941aa178b6ade2bf3e0ff6d7c3a51317 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 14:26:03 +0100 Subject: [PATCH 283/324] Update devanapp.md --- doc/devanapp.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index d62385c434..f0c5d1fa44 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -4,7 +4,7 @@ This page provides a tutorial in the basics of using OPS for multi-block structu ## OPS Abstraction OPS is a Domain Specific Language embedded in C/C++ and Fortran, targeting the development of multi-block structured mesh computations. The abstraction has two distinct components: the definition of the mesh, and operations over the mesh. * Defining a number of 1-3D blocks, and on them a number of datasets, which have specific extents in the different dimensions. -* Describing a parallel loop over a given block, with a given iteration range, executing a given "kernel function" at each grid point, and describing what datasets are going to be accessed and how. +* Describing a parallel loop over a given block, with a given iteration range, executing a given "kernel function" at each mesh point, and describing what datasets are going to be accessed and how. * Additionally, one needs to declare stencils (access patterns) that will be used in parallel loops to access datasets, and any global constants (read-only global scope variables) Data and computations expressed this way can be automatically managed and parallelised by the OPS library. Higher dimensions are supported in the backend, but not currently by the code generators. @@ -13,7 +13,7 @@ Data and computations expressed this way can be automatically managed and parall In this tutorial we will use an example application, a simple 2D iterative Laplace equation solver. 
* Go to the `OPS/apps/c/laplace2dtutorial/original` directory * Open the `laplace2d.cpp` file -* It uses an $imax$ x $jmax$ grid, with an additional 1 layers of boundary cells on all sides +* It uses an $imax$ x $jmax$ mesh, with an additional 1 layers of boundary cells on all sides * There are a number of loops that set the boundary conditions along the four edges * The bulk of the simulation is spent in a whilel oop, repeating a stencil kernel with a maximum reduction, and a copy kernel * Compile and run the code ! @@ -158,7 +158,7 @@ The OPS parallel loop can now be written as follows: ops_par_loop(set_zero, "set_zero", block, 2, bottom_range, ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE)); ``` -The loop will execute `set_zero` at each mesh point defined in the iteration range, and write the dataset `d_A` with the 1-point stencil. The `ops_par_loop` implies that the order in which grid points will be executed will not affect the end result (within machine precision). +The loop will execute `set_zero` at each mesh point defined in the iteration range, and write the dataset `d_A` with the 1-point stencil. The `ops_par_loop` implies that the order in which mesh points will be executed will not affect the end result (within machine precision). ## Step 4 - Indexes and global constants ## Step 5 - Complex stencils and reductions ## Step 6 - Handing it all to OPS From d437cdc71b409e32e712c3b4635fd1895d7b4427 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 14:30:14 +0100 Subject: [PATCH 284/324] Update devanapp.md --- doc/devanapp.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index f0c5d1fa44..6eb4d65411 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -159,6 +159,20 @@ ops_par_loop(set_zero, "set_zero", block, 2, bottom_range, ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE)); ``` The loop will execute `set_zero` at each mesh point defined in the iteration range, and write the dataset `d_A` with the 1-point stencil. The `ops_par_loop` implies that the order in which mesh points will be executed will not affect the end result (within machine precision). + +There are three more loops which set values to zero, they can be trivially replaced with the code above, only altering the iteration range. In the main while loop, the second simpler loop simply copies data from one array to another, this time on the interior of the domain: +``` +int interior_range[] = {0,imax,0,jmax}; +ops_par_loop(copy, "copy", block, 2, interior_range, + ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE), + ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_READ)); +``` +And the corresponding outlined elemental kernel is as follows: +``` +void copy(ACC &A, const ACC &Anew) { + A(0,0) = Anew(0,0); +} +``` ## Step 4 - Indexes and global constants ## Step 5 - Complex stencils and reductions ## Step 6 - Handing it all to OPS From fdcbeabcd478947454445e8a5f5a589aa3f05c95 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 12 Oct 2021 15:57:12 +0100 Subject: [PATCH 285/324] Update devanapp.md --- doc/devanapp.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 6eb4d65411..45342263e3 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -174,6 +174,31 @@ void copy(ACC &A, const ACC &Anew) { } ``` ## Step 4 - Indexes and global constants +There are two sets of boundary loops which use the loop variable j - this is a common technique to initialise data, such as coordinates `(x = i∗dx)`. OPS has a special argument `ops_arg_idx` which gives us a globally coherent (including over MPI) iteration index - between the bounds supplied in the iteration range. +``` +ops_par_loop(left_bndcon, "left_bndcon", block, 2, left_range, + ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_WRITE), + ops_arg_idx()); +``` +And the corresponding outlined user kernel is as follows. Observe the `idx` argument and the +1 offset due to the difference in indexing: +``` +void left_bndcon(ACC &A, const int *idx) { + A(0,0) = sin(pi * (idx[1]+1) / (jmax+1)); +} +``` +This kernel also uses two variables,`jmax` and `pi` that do not depend on the iteration index - they are iteration space invariant. OPS has two ways of supporting this: +1. Global scope constants, through `ops_decl_const`, as done in this example: we need to move the declaration of the `imax`,`jmax` and `pi` variables to global scope (outside of main), and call the OPS API: +``` +//declare and define global constants +ops_decl_const("imax",1,"int",&imax); +ops_decl_const("jmax",1,"int",&jmax); +ops_decl_const("pi",1,"double",&pi); +``` +These ariables do not need to be passed in to the elemental kernel, they are accessible in all elemental kernels. + +2. The other option is to explicitly pass it to the elemental kernel with `ops_arg_gbl`: this is for scalars and small arrays that should not be in global scope. 
+ + ## Step 5 - Complex stencils and reductions ## Step 6 - Handing it all to OPS ## Step 7 - Code generation From 7a5a3edb26410c65ed629ed5a682be15a6bd27a9 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 16:17:51 +0100 Subject: [PATCH 286/324] Update devanapp.md --- doc/devanapp.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 45342263e3..01adc41505 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -187,6 +187,7 @@ void left_bndcon(ACC &A, const int *idx) { } ``` This kernel also uses two variables,`jmax` and `pi` that do not depend on the iteration index - they are iteration space invariant. OPS has two ways of supporting this: + 1. Global scope constants, through `ops_decl_const`, as done in this example: we need to move the declaration of the `imax`,`jmax` and `pi` variables to global scope (outside of main), and call the OPS API: ``` //declare and define global constants @@ -200,6 +201,21 @@ These ariables do not need to be passed in to the elemental kernel, they are acc ## Step 5 - Complex stencils and reductions +There is only one loop left, which uses a 5 point stencil and a reduction. It can be outlined as usual, and for the stencil, we will use `S2Dpt5`. +``` +ops_par_loop(apply_stencil, "apply_stencil", block, 2, interior_range, + ops_arg_dat(d_A, 1, S2D_5pt, "double", OPS_READ), + ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_WRITE), + ops_arg_reduce(h_err, 1, "double", OPS_MAX)) +``` +And the corresponding outlined elemental kernel is as follows. 
Observe the stencil offsets used to access the adjacent 4 points: +``` +void apply_stencil(const ACC &A, ACC &Anew, double *error) { + Anew(0,0) = 0.25f * ( A(1,0) + A(-1,0) + + A(0,-1) + A(0,1)); + *error = fmax( *error, fabs(Anew(0,0)-A(0,0))); +} +``` ## Step 6 - Handing it all to OPS ## Step 7 - Code generation ## Code generated versions From f1d2d665b8020a3cd98ae478273ad0f202433f2f Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 16:53:24 +0100 Subject: [PATCH 287/324] Update devanapp.md --- doc/devanapp.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 01adc41505..8ca7c83cfe 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -188,7 +188,7 @@ void left_bndcon(ACC &A, const int *idx) { ``` This kernel also uses two variables,`jmax` and `pi` that do not depend on the iteration index - they are iteration space invariant. OPS has two ways of supporting this: -1. Global scope constants, through `ops_decl_const`, as done in this example: we need to move the declaration of the `imax`,`jmax` and `pi` variables to global scope (outside of main), and call the OPS API: +1) Global scope constants, through `ops_decl_const`, as done in this example: we need to move the declaration of the `imax`,`jmax` and `pi` variables to global scope (outside of main), and call the OPS API: ``` //declare and define global constants ops_decl_const("imax",1,"int",&imax); @@ -197,7 +197,7 @@ ops_decl_const("pi",1,"double",&pi); ``` These ariables do not need to be passed in to the elemental kernel, they are accessible in all elemental kernels. -2. The other option is to explicitly pass it to the elemental kernel with `ops_arg_gbl`: this is for scalars and small arrays that should not be in global scope. +2) The other option is to explicitly pass it to the elemental kernel with `ops_arg_gbl`: this is for scalars and small arrays that should not be in global scope. 
## Step 5 - Complex stencils and reductions From d1d10460a7e79dbf4516eaac673ed3c097daa51a Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 17:06:38 +0100 Subject: [PATCH 288/324] Update devanapp.md --- doc/devanapp.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/devanapp.md b/doc/devanapp.md index 8ca7c83cfe..2465204f5d 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -216,6 +216,13 @@ void apply_stencil(const ACC &A, ACC &Anew, double *error) { *error = fmax( *error, fabs(Anew(0,0)-A(0,0))); } ``` +The loop also has a special argument for the reduction, `ops_arg_reduce`. As the first argument, it takes a reduction handle, which has to be defined separately: +``` +//Reduction handle +ops_reduction h_err = ops_decl_reduction_handle(sizeof(double), "double", "error"); +``` +Reductions may be increment (`OPS_INC`), min (`OPS_MIN`) or max(`OPS_MAX`). The user kernel will have to perform the reduction operation, reducing the passed in value as well as the computed value. + ## Step 6 - Handing it all to OPS ## Step 7 - Code generation ## Code generated versions From aaeaa26deb55729cdaa6a4e01a538e1de1514476 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 17:45:44 +0100 Subject: [PATCH 289/324] Update devanapp.md --- doc/devanapp.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 2465204f5d..f0bb88a59a 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -221,7 +221,15 @@ The loop also has a special argument for the reduction, `ops_arg_reduce`. As th //Reduction handle ops_reduction h_err = ops_decl_reduction_handle(sizeof(double), "double", "error"); ``` -Reductions may be increment (`OPS_INC`), min (`OPS_MIN`) or max(`OPS_MAX`). The user kernel will have to perform the reduction operation, reducing the passed in value as well as the computed value. +Reductions may be increment (`OPS_INC`), min (`OPS_MIN`) or max (`OPS_MAX`). 
The user kernel will have to perform the reduction operation, reducing the passed in value as well as the computed value. + +The result of the reduction can be queried from the handle as follows: +``` + ops_reduction_result(h_err, &error); +``` + +Multiple parallel loops may use the same handle, and their results will be combined, until the result is queried by the user. Parallel loops that only have the reduction handle in common are semantically independent. + ## Step 6 - Handing it all to OPS ## Step 7 - Code generation From 539c911234ef47c51f9d0aa5e3bcd1a01c0b456a Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 12 Oct 2021 17:47:30 +0100 Subject: [PATCH 290/324] Update devanapp.md --- doc/devanapp.md | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index f0bb88a59a..4e1bd73308 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -218,7 +218,6 @@ void apply_stencil(const ACC &A, ACC &Anew, double *error) { ``` The loop also has a special argument for the reduction, `ops_arg_reduce`. As the first argument, it takes a reduction handle, which has to be defined separately: ``` -//Reduction handle ops_reduction h_err = ops_decl_reduction_handle(sizeof(double), "double", "error"); ``` Reductions may be increment (`OPS_INC`), min (`OPS_MIN`) or max (`OPS_MAX`). The user kernel will have to perform the reduction operation, reducing the passed in value as well as the computed value. From 3fd6864f11854ce6ed42b79fdf3cbedd4bad346a Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Mon, 18 Oct 2021 14:03:50 +0100 Subject: [PATCH 291/324] Update devanapp.md --- doc/devanapp.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 4e1bd73308..dbbbcf063f 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -229,10 +229,34 @@ The result of the reduction can be queried from the handle as follows: Multiple parallel loops may use the same handle, and their results will be combined, until the result is queried by the user. Parallel loops that only have the reduction handle in common are semantically independent. - ## Step 6 - Handing it all to OPS + +We have now successfully converted all computations on the mesh to OPS parallel loops. In order for OPS to manage data and parallelisations better, we should let OPS allocate the datasets - instead of passing in the pointers to memory allocated by us, we just pass in NULL (`A` and `Anew`). Parallel I/O can be done using HDF5 - see the ops_hdf5.h header. + +All data and parallelisation is now handed to OPS. We can now also compile the developer MPI version of the code - see the Makefile, and try building `laplace2d_mpi`. + ## Step 7 - Code generation + +Now that the developer versions of our code work, it’s time to generate code. On the console, type: +``` +$OPSINSTALLPATH/../ops_translator/c/ops.py laplace2d.cpp +``` +We have provided a Makefile which can use several different compilers (intel, cray, pgi, clang), we suggest modifying it for your own applications. Try building CUDA, OpenMP, MPI+CUDA, MPI+OpenMP, and other versions of the code. You can take a look at the generated kernels for different parallelisations under the appropriate subfolders. + +If you add the−`OPS_DIAGS=2` runtime flag, at the end of execution, OPS will report timings and achieved bandwidth for each of your kernels. For more options, see the user guide. 
+ + ## Code generated versions +OPS will generate and compile a large number of different versions. +* `laplace2d_dev_seq` and `laplace2d_dev_mpi` : these do not use code generation, they are intended for development only +* `laplace2d_seq` and `laplace2d_mpi` : baseline sequential and MPI implementations +* `laplace2d_openmp` : baseline OpenMP implementation +* `laplace2d_cuda`, `laplace2d_opencl`, `laplace2d_openacc` : implementations targeting GPUs +* `laplace2d_mpiinline` : optimised implementation with MPI+OpenMP +* `laplace2d_tiled`: optimised implementation with OpenMP that improves spatial and temporal locality + + + ## Optimizations - general ## Optimizations - tiling From 83d37da00855febda27debd3abd1bd46713a36fe Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 18 Oct 2021 14:17:15 +0100 Subject: [PATCH 292/324] Update devanapp.md --- doc/devanapp.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index dbbbcf063f..6179f2693c 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -243,7 +243,7 @@ $OPSINSTALLPATH/../ops_translator/c/ops.py laplace2d.cpp ``` We have provided a Makefile which can use several different compilers (intel, cray, pgi, clang), we suggest modifying it for your own applications. Try building CUDA, OpenMP, MPI+CUDA, MPI+OpenMP, and other versions of the code. You can take a look at the generated kernels for different parallelisations under the appropriate subfolders. -If you add the−`OPS_DIAGS=2` runtime flag, at the end of execution, OPS will report timings and achieved bandwidth for each of your kernels. For more options, see the user guide. +If you add the−`OPS_DIAGS=2` runtime flag, at the end of execution, OPS will report timings and achieved bandwidth for each of your kernels. For more options, see [Runtime Flags and Options](https://ops-dsl.readthedocs.io/en/markdowndocdev/devanapp.html#runtime-flags-and-options). 
## Code generated versions @@ -255,11 +255,24 @@ OPS will generate and compile a large number of different versions. * `laplace2d_mpiinline` : optimised implementation with MPI+OpenMP * `laplace2d_tiled`: optimised implementation with OpenMP that improves spatial and temporal locality +## Optimizations - general +Try the following performance tuning options +* `laplace2d_cuda`, `laplace2d_opencl` : you can set the `OPS_BLOCK_SIZE_X` and `OPS_BLOCK_SIZE_Y` runtime arguments to control thread block or work group sizes +* `laplace2d_mpi_cuda`, `laplace2d_mpi_openacc` : add the `-gpudirect` runtime flag to enable GPU Direct communications -## Optimizations - general ## Optimizations - tiling +Tiling uses lazy execution: as parallel loops follow one another, they are not executed, but put in a queue, and only once some data needs to be returned to the user (e.g. result of a reduction) do these loops have to be executed. + +With a chain of loops queued, OPS can analyse them together and come up with a tiled execution schedule. + +This works over MPI as well: OPS extends the halo regions, and does one big halo exchange instead of several smaller ones. In the current `laplace2d` code, every stencil application loop is also doing a reduction, therefore only two loops are queued. Try modifying the code so the reduction only happens every 10 iterations ! On A Xeon E5-2650, one can get a 2.5x speedup. + +The following versions can be executed with the tiling optimzations. + +* `laplace2d_tiled`, `laplace2d_mpi_tiled` : add the `OPS_TILING` runtime flag, and move `-OPSDIAGS=3` to see the cache blocking tiling at work. For some applications, such as this one, the initial guess gives too large tiles, try setting `OPS_CACHE_SIZE` to a lower value (in MB, for L3 size). Thread affinity control and using 1 process per socket isstrongly recommended. E.g. `OMP_NUM_THREADS=20 numactl--cpunodebind=0 ./laplace2dtiled -OPSDIAGS=3 OPS_TILING OPS_CACHE_SIZE=5`. 
Over MPI, you will have to set `OPS_TILING_MAX_DEPTH` to extend halo regions. + ## Supported Paralleizations ## Code-generation Flags ## Runtime Flags and Options From 9594492091a013c7807379e460b2f98a3d9bc245 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 18 Oct 2021 14:22:21 +0100 Subject: [PATCH 293/324] Update devanapp.md --- doc/devanapp.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 6179f2693c..dbb72e4401 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -267,12 +267,14 @@ Tiling uses lazy execution: as parallel loops follow one another, they are not e With a chain of loops queued, OPS can analyse them together and come up with a tiled execution schedule. -This works over MPI as well: OPS extends the halo regions, and does one big halo exchange instead of several smaller ones. In the current `laplace2d` code, every stencil application loop is also doing a reduction, therefore only two loops are queued. Try modifying the code so the reduction only happens every 10 iterations ! On A Xeon E5-2650, one can get a 2.5x speedup. +This works over MPI as well: OPS extends the halo regions, and does one big halo exchange instead of several smaller ones. In the current `laplace2d` code, every stencil application loop is also doing a reduction, therefore only two loops are queued. Try modifying the code so the reduction only happens every 10 iterations ! On a Xeon E5-2650, one can get a 2.5x speedup. The following versions can be executed with the tiling optimzations. * `laplace2d_tiled`, `laplace2d_mpi_tiled` : add the `OPS_TILING` runtime flag, and move `-OPSDIAGS=3` to see the cache blocking tiling at work. For some applications, such as this one, the initial guess gives too large tiles, try setting `OPS_CACHE_SIZE` to a lower value (in MB, for L3 size). Thread affinity control and using 1 process per socket isstrongly recommended. E.g. 
`OMP_NUM_THREADS=20 numactl--cpunodebind=0 ./laplace2dtiled -OPSDIAGS=3 OPS_TILING OPS_CACHE_SIZE=5`. Over MPI, you will have to set `OPS_TILING_MAX_DEPTH` to extend halo regions. ## Supported Paralleizations -## Code-generation Flags + + ## Runtime Flags and Options From 3d38a6b6e2117d4ed56082c1bfa17ed63792160a Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 18 Oct 2021 16:07:15 +0100 Subject: [PATCH 295/324] Update introduction.md --- doc/introduction.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index ac9e9b2da0..1e25b5f3e1 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -2,7 +2,10 @@ ## Overview -[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. +[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language (eDSL) for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. 
These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. + +The current OPS eDSL supports generating code targeting multi-core/multi-threaded CPUs, many-core GPUs and clusters of CPUs and GPUs using a range of paralleization models including SIMD vectorization, OpenMP, CUDA, OpenCL, OpenACC and their combinations with MPI. There is also experimental support for paralleizations using SYCL and AMD HIP. Various optimizations for each paralleization can be generated automatically, including cache blocking tiling to improve locality. The OPS API and library can also be used to solve scalar multi-dimensional tridiagonal systems using the [tridsolver](https://github.com/OP-DSL/tridsolver) library. + ## Licencing OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. From b53e7a2e6e5e19f183cd4b7405c09f73b3581f9f Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 18 Oct 2021 16:32:41 +0100 Subject: [PATCH 296/324] Update devanapp.md --- doc/devanapp.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index a9cd82bde4..209bd295ed 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -273,8 +273,25 @@ The following versions can be executed with the tiling optimzations. * `laplace2d_tiled`, `laplace2d_mpi_tiled` : add the `OPS_TILING` runtime flag, and move `-OPSDIAGS=3` to see the cache blocking tiling at work. For some applications, such as this one, the initial guess gives too large tiles, try setting `OPS_CACHE_SIZE` to a lower value (in MB, for L3 size). Thread affinity control and using 1 process per socket isstrongly recommended. E.g. `OMP_NUM_THREADS=20 numactl--cpunodebind=0 ./laplace2dtiled -OPSDIAGS=3 OPS_TILING OPS_CACHE_SIZE=5`. 
Over MPI, you will have to set `OPS_TILING_MAX_DEPTH` to extend halo regions. -## Supported Paralleizations + ## Runtime Flags and Options + +### General flags +* `OPS_DIAGS=` +* `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` +* `-gpudirect` +* `OPS_CL_DEVICE=` +* `OPS_TILING` +* `OPS_TILING_MAXDEPTH=` + +### Tridsolver API flags +* -halo 1 +* -m +* `-bx`, `-by` and `-bz` + + + + From 734cf6563fa9febb4ae76f1940e72f63245c2129 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 18 Oct 2021 16:48:40 +0100 Subject: [PATCH 297/324] Update devdoc.md --- doc/devdoc.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index e959787cdd..85425fa416 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -9,6 +9,16 @@ ### HDF5 ### CUDA ### Cache blocking tiling and comm-avoiding optimizations + +## To contribute to OPS please use the following steps : + +Clone the [OPS](https://github.com/OP-DSL/OPS) repository (on your local system). +Create a new branch in your cloned repository +Make changes / contributions in your new branch +Submit your changes by creating a Pull Request to the `develop` branch of the OPS repository +The contributions in the `develop` branch will be merged into the `master` branch as we create a new release. + + From e4ff5ec7c572d14be193c9bc37531381591632a1 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 19 Oct 2021 16:51:11 +0100 Subject: [PATCH 298/324] Update devanapp.md --- doc/devanapp.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 209bd295ed..03a556144b 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -288,8 +288,8 @@ The following versions can be executed with the tiling optimzations. * `OPS_TILING_MAXDEPTH=` ### Tridsolver API flags -* -halo 1 -* -m +* `-halo 1` +* `-m` * `-bx`, `-by` and `-bz` From 6ae97e77a81a76cfdb01d3a5eee472d60aff90e7 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Wed, 20 Oct 2021 16:38:55 +0100 Subject: [PATCH 299/324] Update pubs.md --- doc/pubs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/pubs.md b/doc/pubs.md index 9aca12e2f5..8c6f967998 100644 --- a/doc/pubs.md +++ b/doc/pubs.md @@ -1,2 +1,3 @@ # Publications +See [OP-DSL publications page](https://op-dsl.github.io/papers.html). From c03899b29e8d33b0cdb2dc141eed7352e0ec3726 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 16:56:41 +0100 Subject: [PATCH 300/324] Update devdoc.md --- doc/devdoc.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index 85425fa416..fdc401efe3 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -1,4 +1,6 @@ # Developer Guide +Under construction. + ## To contribute to OPS please use the following steps : Clone the [OPS](https://github.com/OP-DSL/OPS) repository (on your local system). From db8de1f48d2cf20b2c417133e3d7fe341c2cc456 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 16:59:16 +0100 Subject: [PATCH 301/324] Update devdoc.md --- doc/devdoc.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/devdoc.md b/doc/devdoc.md index fdc401efe3..3d8efb5abc 100644 --- a/doc/devdoc.md +++ b/doc/devdoc.md @@ -14,10 +14,11 @@ Under construction. --> ## To contribute to OPS please use the following steps : -Clone the [OPS](https://github.com/OP-DSL/OPS) repository (on your local system). -Create a new branch in your cloned repository -Make changes / contributions in your new branch -Submit your changes by creating a Pull Request to the `develop` branch of the OPS repository +1. Clone the [OPS](https://github.com/OP-DSL/OPS) repository (on your local system). +2. Create a new branch in your cloned repository +3. Make changes / contributions in your new branch +4. 
Submit your changes by creating a Pull Request to the `develop` branch of the OPS repository + The contributions in the `develop` branch will be merged into the `master` branch as we create a new release. -## To contribute to OPS please use the following steps : +## Contributing +To contribute to OPS please use the following steps : 1. Clone the [OPS](https://github.com/OP-DSL/OPS) repository (on your local system). 2. Create a new branch in your cloned repository 3. Make changes / contributions in your new branch From 0a9c44c006be3594c54e13f9c7e25daeabbb91c6 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 17:07:09 +0100 Subject: [PATCH 303/324] Update perf.md --- doc/perf.md | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index be069ba51b..b723960e8b 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -45,7 +45,6 @@ arguments. For example, ```bash ./cloverleaf_cuda OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 ``` -## CUDA-aware MPI ## OpenCL arguments `OPS_CL_DEVICE=XX` runtime flag sets the OpenCL device to execute the From 0087ff5e5b65212630dd3c38b7a0e7c8d624ecc5 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 17:25:58 +0100 Subject: [PATCH 304/324] Create numawrap --- scripts/numawrap | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 scripts/numawrap diff --git a/scripts/numawrap b/scripts/numawrap new file mode 100644 index 0000000000..e36467b975 --- /dev/null +++ b/scripts/numawrap @@ -0,0 +1,37 @@ +#!/bin/bash +# e.g. mpirun -np 4 numawrap ./application + +# Find the rank of the process from the MPI local rank environment variable +# to ensure unique output filenames. 
+if [[ -n ${OMPI_COMM_WORLD_LOCAL_RANK} ]] + then + let lrank=${OMPI_COMM_WORLD_LOCAL_RANK} +elif [[ -n ${MV2_COMM_WORLD_LOCAL_RANK} ]] + then + let lrank=${MV2_COMM_WORLD_LOCAL_RANK} +elif [[ -n ${PMI_RANK} ]] + then + let lrank=${PMI_RANK} +elif [[ -n ${PMI_ID} && -n ${MPISPAWN_LOCAL_NPROCS} ]] + then + let lrank=${PMI_ID}%${PERHOST} +elif [[ -n ${MPI_LOCALRANKID} ]] + then + let lrank=${MPI_LOCALRANKID} +else + echo could not determine local rank +fi + +export CUDA_VISIBLE_DEVICES=${lrank} + +# let lrank=${PMI_RANK} +echo $lrank + +# use $lrank -lt 2 and $lrank -ge 2 to distribute and bind 4 procs on to 2 CPUs +if [[ $lrank -lt 2 ]]; then + numactl --cpunodebind=0 ${@} +fi + +if [[ $lrank -ge 2 ]]; then + numactl --cpunodebind=1 ${@} +fi From 8eaf7e9ce5d84bde606f4c8cf0b79fa7f40e52d9 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 17:28:12 +0100 Subject: [PATCH 305/324] Update perf.md --- doc/perf.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/perf.md b/doc/perf.md index b723960e8b..f90d15b29e 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -1,6 +1,6 @@ # Performance Tuning -## Vectorization + ## Executing with GPUDirect @@ -22,7 +22,7 @@ When MPI is combined with OpenMP tiling can be extended to the MPI halos. 
Set `OPS_TILING_MAXDEPTH` to increase the the halo depths so that halos for multiple `ops_par_loops` can be exchanged with a single MPI message (see [@TPDS2017] for more details)\ -To test, compile CloverLeaf under ``apps/c/CloverLeaf``, modify clover.in +To test, compile CloverLeaf under ``OPS/apps/c/CloverLeaf``, modify clover.in to use a $6144^2$ mesh, then run as follows:\ For OpenMP with tiling: ```bash @@ -37,7 +37,9 @@ OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments: ```bash export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING OPS_TILESIZE_X=600 OPS_TILESIZE_Y=200 ``` -## OpenMP with MPI +## OpenMP and OpenMP+MPI +It is recommended that you assign one MPI rank per NUMA region when executing MPI+OpenMP parallel code. Usually for a multi-CPU system a single CPU socket is a single NUMA region. Thus, for a 4 socket system, OPS's MPI+OpenMP code should be executed with 4 MPI processes with each MPI process having multiple OpenMP threads (typically specified by the `OMP_NUM_THREAD`s flag). Additionally on some systems using `numactl` to bind threads to cores could give performance improvements (see `OPS/scripts/numawrap` for an example script that wraps the `numactl` command to be used with common MPI distributions). + ## CUDA arguments The CUDA (and OpenCL) thread block sizes can be controlled by setting the ``OPS_BLOCK_SIZE_X``, ``OPS_BLOCK_SIZE_Y`` and ``OPS_BLOCK_SIZE_Z`` runtime @@ -45,8 +47,8 @@ arguments. For example, ```bash ./cloverleaf_cuda OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4 ``` -## OpenCL arguments +## OpenCL arguments `OPS_CL_DEVICE=XX` runtime flag sets the OpenCL device to execute the code on. From 79b35f2d8240ac6cd6c1b96e44be37b775efa43a Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Wed, 20 Oct 2021 17:35:47 +0100 Subject: [PATCH 306/324] Update devanapp.md --- doc/devanapp.md | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/doc/devanapp.md b/doc/devanapp.md index 03a556144b..40607e76a4 100644 --- a/doc/devanapp.md +++ b/doc/devanapp.md @@ -273,24 +273,7 @@ The following versions can be executed with the tiling optimzations. * `laplace2d_tiled`, `laplace2d_mpi_tiled` : add the `OPS_TILING` runtime flag, and move `-OPSDIAGS=3` to see the cache blocking tiling at work. For some applications, such as this one, the initial guess gives too large tiles, try setting `OPS_CACHE_SIZE` to a lower value (in MB, for L3 size). Thread affinity control and using 1 process per socket isstrongly recommended. E.g. `OMP_NUM_THREADS=20 numactl--cpunodebind=0 ./laplace2dtiled -OPSDIAGS=3 OPS_TILING OPS_CACHE_SIZE=5`. Over MPI, you will have to set `OPS_TILING_MAX_DEPTH` to extend halo regions. - - - -## Runtime Flags and Options - -### General flags -* `OPS_DIAGS=` -* `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` -* `-gpudirect` -* `OPS_CL_DEVICE=` -* `OPS_TILING` -* `OPS_TILING_MAXDEPTH=` - -### Tridsolver API flags -* `-halo 1` -* `-m` -* `-bx`, `-by` and `-bz` From 6d94ca448a3d7e3e506401dc1b9d9049b6ecaa3d Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 17:38:52 +0100 Subject: [PATCH 307/324] Update opsapi.md --- doc/opsapi.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/doc/opsapi.md b/doc/opsapi.md index b8c9a908e7..d3b78dd7ac 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -683,5 +683,22 @@ This routine copies the data given by the user to the internal data structure u |part| the chunk index (has to be 0)| |data| pointer to memory which should be copied to OPS | +## Runtime Flags and Options + +The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. 
+### General flags +* `OPS_DIAGS=` +* `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` +* `-gpudirect` +* `OPS_CL_DEVICE=` +* `OPS_TILING` +* `OPS_TILING_MAXDEPTH=` + +### Tridsolver API flags +* `-halo 1` +* `-m` +* `-bx`, `-by` and `-bz` + + ## Doxygen Doxygen generated from OPS source can be found [here](https://op-dsl-ci.gitlab.io/ops-ci/). From 749d3213cd4010cbf53e93db512ae4a619c3d2d8 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 17:44:38 +0100 Subject: [PATCH 308/324] Update apps.md --- doc/apps.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/apps.md b/doc/apps.md index 5bee74c5de..0b8fca0efb 100644 --- a/doc/apps.md +++ b/doc/apps.md @@ -1,5 +1,11 @@ # Examples -## CloverLeaf 2D + +See `OPS/apps/[c|fortran]/[application]/test.sh` on compiling and running various parallel versions generated by OPS for each application. + +Further documentation under construction. + + From bff0cee0790cab17ecb5f18f3e8586590a87e67b Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Wed, 20 Oct 2021 22:07:22 +0100 Subject: [PATCH 309/324] Update perf.md --- doc/perf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index f90d15b29e..822160a597 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -21,7 +21,7 @@ in Megabytes. To manually specify the tile sizes, use the When MPI is combined with OpenMP tiling can be extended to the MPI halos. Set `OPS_TILING_MAXDEPTH` to increase the the halo depths so that halos for multiple `ops_par_loops` can be exchanged with a single MPI -message (see [@TPDS2017] for more details)\ +message (see [TPDS2017](https://ieeexplore.ieee.org/abstract/document/8121995) for more details)\ To test, compile CloverLeaf under ``OPS/apps/c/CloverLeaf``, modify clover.in to use a $6144^2$ mesh, then run as follows:\ For OpenMP with tiling: From 7c31de0548de267d2792ce264ba313354fc30707 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Wed, 20 Oct 2021 22:09:28 +0100 Subject: [PATCH 310/324] Update perf.md --- doc/perf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/perf.md b/doc/perf.md index 822160a597..1cec2e145f 100644 --- a/doc/perf.md +++ b/doc/perf.md @@ -38,7 +38,7 @@ OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments: export OMP_NUM_THREADS=xx; numactl -physnodebind=0 ./cloverleaf_tiled OPS_TILING OPS_TILESIZE_X=600 OPS_TILESIZE_Y=200 ``` ## OpenMP and OpenMP+MPI -It is recommended that you assign one MPI rank per NUMA region when executing MPI+OpenMP parallel code. Usually for a multi-CPU system a single CPU socket is a single NUMA region. Thus, for a 4 socket system, OPS's MPI+OpenMP code should be executed with 4 MPI processes with each MPI process having multiple OpenMP threads (typically specified by the `OMP_NUM_THREAD`s flag). Additionally on some systems using `numactl` to bind threads to cores could give performance improvements (see `OPS/scripts/numawrap` for an example script that wraps the `numactl` command to be used with common MPI distributions). +It is recommended that you assign one MPI rank per NUMA region when executing MPI+OpenMP parallel code. Usually for a multi-CPU system a single CPU socket is a single NUMA region. Thus, for a 4 socket system, OPS's MPI+OpenMP code should be executed with 4 MPI processes with each MPI process having multiple OpenMP threads (typically specified by the `OMP_NUM_THREAD` flag). Additionally on some systems using `numactl` to bind threads to cores could give performance improvements (see `OPS/scripts/numawrap` for an example script that wraps the `numactl` command to be used with common MPI distributions). 
## CUDA arguments The CUDA (and OpenCL) thread block sizes can be controlled by setting From b169cd7fe8c4369794251e75bcc5563cb72dfb33 Mon Sep 17 00:00:00 2001 From: Istvan Reguly Date: Thu, 21 Oct 2021 16:57:45 +0200 Subject: [PATCH 311/324] C API clarification --- doc/opsapi.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index d3b78dd7ac..3562a11453 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -27,7 +27,7 @@ also under MPI). Reductions in loops are done using the `ops_arg_reduce` argument, which takes a reduction handle as an argument. The result of the reduction can then be acquired using a separate call to `ops_reduction_result`. The semantics are the following: a reduction handle after it was declared is in an "uninitialised" state. The first time it is used as an argument to a loop, its type is determined (increment/min/max), and is initialised appropriately $(0,\infty,-\infty)$, and subsequent uses of the handle in parallel loops are combined together, up until the point, where the result is acquired using `ops_reduction_result`, which then sets it back to an uninitialised state. This also implies, that different parallel loops, which all use the same reduction handle, but are otherwise independent, are independent and their partial reduction results can be combined together associatively and commutatively. -OPS takes responsibility for all data, its movement and the execution of parallel loops. With different execution hardware and optimisations, this means OPS will **re-organise** data as well as execution (potentially across different loops), and therefore **any data accesses or manipulation must only be done through the OPS API**. +OPS takes responsibility for all data, its movement and the execution of parallel loops. 
With different execution hardware and optimisations, this means OPS will **re-organise** data as well as execution (potentially across different loops), and therefore **data accesses or manipulation should only be done through the OPS API**. There is an external data access API that allows access to the data stored by OPS which in turn allows interfacing with external libraries. This restriction is exploited by a lazy execution mechanism in OPS. The idea is that OPS API calls that do not return a result need not be executed immediately, rather queued, and once an API call requires returning some data, operations in the queue are executed, and the result is returned. This allows OPS to analyse and optimise operations in the queue together. This mechanism is fully automated by OPS, and is used with the various `_tiled` executables. For more information on how to use this mechanism for improving CPU performance, see Section on Tiling. Some API calls triggering the execution of queued operations include `ops_reduction_result`, and the functions in the @@ -43,7 +43,7 @@ To further clarify some of the important issues encountered when designing the O OPS handle all of these different requirements through stencil definitions. -## C/C++ API +## C API ### Initialisation and termination routines @@ -567,7 +567,7 @@ in a tech-report on checkpointing, to be published later. ### Access to OPS data -his section describes APIS that give the user access to internal data structures in OPS and return data to user-space. These should be used cautiously and sparsely, as they can affect performance significantly +This section describes APIs that give the user access to internal data structures in OPS and return data to user-space. These should be used cautiously and sparsely, as they can affect performance significantly #### ops_dat_get_local_npartitions From 361e126adfb722a383d7b50580f22d74c804f598 Mon Sep 17 00:00:00 2001 From: "Gihan R. 
Mudalige" Date: Tue, 26 Oct 2021 10:28:42 +0100 Subject: [PATCH 312/324] Update introduction.md --- doc/introduction.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/introduction.md b/doc/introduction.md index 1e25b5f3e1..d12c564010 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -2,9 +2,11 @@ ## Overview -[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language (eDSL) for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. +[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language (eDSL) for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consists of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran. -The current OPS eDSL supports generating code targeting multi-core/multi-threaded CPUs, many-core GPUs and clusters of CPUs and GPUs using a range of paralleization models including SIMD vectorization, OpenMP, CUDA, OpenCL, OpenACC and their combinations with MPI. There is also experimental support for paralleizations using SYCL and AMD HIP. Various optimizations for each paralleization can be generated automatically, including cache blocking tiling to improve locality. 
The OPS API and library can also be used to solve scalar multi-dimensional tridiagonal systems using the [tridsolver](https://github.com/OP-DSL/tridsolver) library. +The current OPS eDSL supports generating code targeting multi-core/multi-threaded CPUs, many-core GPUs and clusters of CPUs and GPUs using a range of paralleization models including SIMD vectorization, OpenMP, CUDA, OpenCL, OpenACC and their combinations with MPI. There is also experimental support for paralleizations using SYCL and AMD HIP. Various optimizations for each paralleization can be generated automatically, including cache blocking tiling to improve locality. The OPS API and library can also be used to solve multi-dimensional tridiagonal systems using the [tridsolver](https://github.com/OP-DSL/tridsolver) library. + +These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning. ## Licencing From 50bf119de3b96f6925f78bc42582f95d4732d02c Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 26 Oct 2021 10:29:22 +0100 Subject: [PATCH 313/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index d12c564010..c3a73f687a 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -10,7 +10,7 @@ These pages provide detailed documentation on using OPS, including an installati ## Licencing -OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information. +OPS is released as an open-source project under the BSD 3-Clause License. See the [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) file for more information. 
## Citing To cite OPS, please reference the following paper: From cba58df97498da2e35db741351c1461a2196741c Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 26 Oct 2021 10:31:30 +0100 Subject: [PATCH 314/324] Update introduction.md --- doc/introduction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/introduction.md b/doc/introduction.md index c3a73f687a..b458afae3d 100644 --- a/doc/introduction.md +++ b/doc/introduction.md @@ -36,4 +36,4 @@ The preferred method of reporting bugs and issues with OPS is to submit an issue ## Funding The development of OPS was in part supported by the UK Engineering and Physical Sciences Research Council (EPSRC) grants [EP/K038494/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/K038494/1) (“Future-proof massively-parallel execution of multi-block applications”), [EP/J010553/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/J010553/1) (“Software for Emerging Architectures - ASEArch"), The UK Turbulence Consortium grant [EP/T026170/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/T026170/1), The Janos Bolyai Research Scholarship of the Hungarian Academy of Sciences, the Royal Society through their Industry Fellowship Scheme (INF/R1/180012), and the Thematic Research Cooperation Establishing Innovative Informatic and Info-communication Solutions Project, which has been supported by the European Union and co-financed by the European Social Fund under grant number EFOP-3.6.2-16-2017-00013. Research funding support was also provided by the UK AWE under grants CDK0660 ("The Production of Predictive Models for Future Computing Requirements"), CDK0724 ("AWE Technical Outreach Programme"), AWE grant for "High-level Abstractions for Performance, Portability and Continuity of Scientific Software on Future Computing Systems" and the Numerical Algorithms Group [NAG](https://www.nag.com/). 
-Hardware resources for development and testing provided by the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. DE-AC05-00OR22725, the [ARCHER](http://www.archer.ac.uk) and ARCHER2(https://www.archer2.ac.uk/) UK National Supercomputing Service, [University of Oxford Advanced Research Computing (ARC) facility](http://dx.doi.org/10.5281/zenodo.22558) and through hardware donations and access provided by NVIDIA and Intel. +Hardware resources for development and testing provided by the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. DE-AC05-00OR22725, the [ARCHER](http://www.archer.ac.uk) and [ARCHER2](https://www.archer2.ac.uk/) UK National Supercomputing Service, [University of Oxford Advanced Research Computing (ARC) facility](http://dx.doi.org/10.5281/zenodo.22558) and through hardware donations and access provided by NVIDIA and Intel. From c9cdec0ec818ebbf9c4a75c6b05e6b5c08923378 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 26 Oct 2021 10:47:12 +0100 Subject: [PATCH 315/324] Update installation.md --- doc/installation.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/installation.md b/doc/installation.md index 97031296b2..e685820de7 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -4,6 +4,8 @@ ## Dependencies +The following prerequisites and dependencies are required for building OPS. Building each of the **backends** are optional and depends on the hardware and/or capabilities you will be targeting. + **CMake** CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script. 
@@ -26,19 +28,19 @@ Python2 is required by the OPS Python translator. The CMake build system will tr [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using `-DHDF5_ROOT`. - **CUDA** + **CUDA Backend** The [CUDA](https://developer.nvidia.com/cuda-downloads) backend targets NVIDIA GPUs with a compute capability of 3.0 or greater. The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path. -**HIP** +**HIP Backend** -The HIP backend targets AMD GPUs and NVIDIA GPUs which are supported by HIP - either through its CUDA support or the ROCm stack (tested with >=3.9). +The HIP backend targets AMD GPUs and NVIDIA GPUs which are supported by HIP - either through its CUDA support or the [ROCm](https://rocmdocs.amd.com/en/latest/) stack (tested with >=3.9). -**SYCL** +**SYCL Backend** -The SYCL backend is currently in development and only working without MPI. It has been tested with Intel OneAPI (>=2021.1), Intel's public LLVM version, and hipSYCL (>=0.9.1), and runs on Intel CPUs and GPUs through Intel's OpenCL and Level Zero, NVIDIA and AMD GPUs both with the LLVM fork as well as hipSYCL. hipSYCL's OpenMP support covers most CPU architectures too. +The [SYCL](https://www.khronos.org/sycl/) backend is currently in development and only working without MPI. It has been tested with Intel OneAPI (>=2021.1), Intel's public LLVM version, and hipSYCL (>=0.9.1), and runs on Intel CPUs and GPUs through Intel's OpenCL and Level Zero, NVIDIA and AMD GPUs both with the LLVM fork as well as hipSYCL. hipSYCL's OpenMP support covers most CPU architectures too. 
-**Tridiagonal Solver** +**Tridiagonal Solver Backend** To use the tridiagonal solver OPS API in applications and build example applications such as `adi`, `adi_burger` and `adi_burger_3D` the open source tridiagonal solver (scalar) library needs to be cloned and built from the [Tridsolver repository](https://github.com/OP-DSL/tridsolver). ```bash From f8faa3cb5a68d96e37bd4c0193ea3c0d531a3335 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Tue, 26 Oct 2021 10:49:44 +0100 Subject: [PATCH 316/324] Update installation.md --- doc/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/installation.md b/doc/installation.md index e685820de7..cbc04f505a 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -127,7 +127,7 @@ The current tests are mainly based on the applications. * `MPI_INSTALL_PATH` - Installation directory of MPI (to build MPI based distributed memory libs and applications) * `HDF5_INSTALL_PATH` - Installation directory of HDF5 (to support HDF5 based File I/O) -See example scripts (e.g. source_intel, source_pgi_15.10, source_cray) under `OPS/ops/scripts` that sets up the environment for building with various compilers (Intel, PGI, Cray). +See example scripts (e.g. `source_intel`, `source_pgi_15.10`, `source_cray`) under `OPS/ops/scripts` that sets up the environment for building with various compilers (Intel, PGI, Cray). #### Build back-end library For C/C++ back-end use Makefile under `OPS/ops/c` (modify Makefile if required). 
The libraries will be built in `OPS/ops/c/lib` From ba9aae9d8dea82f2737a7e7b72cbeae1e9603022 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Fri, 29 Oct 2021 17:36:53 +0100 Subject: [PATCH 317/324] Minor upgrade on Doxyfile --- doc/ops/Doxyfile | 253 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 172 insertions(+), 81 deletions(-) diff --git a/doc/ops/Doxyfile b/doc/ops/Doxyfile index 32f7c733c8..a116d6466f 100644 --- a/doc/ops/Doxyfile +++ b/doc/ops/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.8.11 +# Doxyfile 1.8.16 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -17,11 +17,11 @@ # Project related configuration options #--------------------------------------------------------------------------- -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 @@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "OPS" +PROJECT_NAME = OPS # The PROJECT_NUMBER tag can be used to enter a project or revision number. 
This # could be handy for archiving the generated documentation or if some version @@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO OUTPUT_LANGUAGE = English +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. @@ -179,6 +187,16 @@ SHORT_NAMES = NO JAVADOC_AUTOBRIEF = YES +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus @@ -226,7 +244,12 @@ TAB_SIZE = 4 # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. 
+# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) ALIASES = @@ -264,17 +287,26 @@ OPTIMIZE_FOR_FORTRAN = YES OPTIMIZE_OUTPUT_VHDL = NO +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files), VHDL, tcl. 
For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is +# Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # @@ -285,7 +317,7 @@ EXTENSION_MAPPING = inc=Fortran # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. +# documentation. See https://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. @@ -293,6 +325,15 @@ EXTENSION_MAPPING = inc=Fortran MARKDOWN_SUPPORT = YES +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -309,7 +350,7 @@ AUTOLINK_SUPPORT = YES # diagrams that involve STL classes more complete and accurate. # The default value is: NO. -BUILTIN_STL_SUPPORT = NO +BUILTIN_STL_SUPPORT = YES # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. 
@@ -318,7 +359,7 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. @@ -424,6 +465,12 @@ EXTRACT_ALL = YES EXTRACT_PRIVATE = NO +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. @@ -502,7 +549,7 @@ INTERNAL_DOCS = NO # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. +# (including Cygwin) ands Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = YES @@ -512,7 +559,7 @@ CASE_SENSE_NAMES = YES # scope will be hidden. # The default value is: NO. -HIDE_SCOPE_NAMES = NO +HIDE_SCOPE_NAMES = YES # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to @@ -689,7 +736,7 @@ LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. 
See also http://en.wikipedia.org/wiki/BibTeX for more info. +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. @@ -734,7 +781,8 @@ WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. # The default value is: NO. WARN_NO_PARAMDOC = NO @@ -771,12 +819,16 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = ../README.md ../ops/c/src ../ops/c/include ../ops/fortran/src ../ops/fortran/include +INPUT = ../README.md \ + ../ops/c/src \ + ../ops/c/include \ + ../ops/fortran/src \ + ../ops/fortran/include # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of # possible encodings. # The default value is: UTF-8. 
@@ -793,8 +845,8 @@ INPUT_ENCODING = UTF-8 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, -# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = @@ -949,7 +1001,7 @@ INLINE_SOURCES = NO STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. +# entity all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO @@ -981,12 +1033,12 @@ SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version +# (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # @@ -1014,7 +1066,7 @@ VERBATIM_HEADERS = YES # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was -# generated with the -Duse-libclang=ON option for CMake. +# generated with the -Duse_libclang=ON option for CMake. # The default value is: NO. 
CLANG_ASSISTED_PARSING = NO @@ -1027,6 +1079,16 @@ CLANG_ASSISTED_PARSING = NO CLANG_OPTIONS = +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. + +CLANG_DATABASE_PATH = + #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- @@ -1145,7 +1207,7 @@ HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. @@ -1181,6 +1243,17 @@ HTML_COLORSTYLE_GAMMA = 80 HTML_TIMESTAMP = NO +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. @@ -1204,13 +1277,13 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# environment (see: https://developer.apple.com/xcode/), introduced with OSX +# 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1249,7 +1322,7 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output @@ -1325,7 +1398,7 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. 
For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1333,7 +1406,7 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1342,7 +1415,7 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1350,7 +1423,7 @@ QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1358,7 +1431,7 @@ QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). 
# This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = @@ -1416,7 +1489,7 @@ DISABLE_INDEX = NO # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -GENERATE_TREEVIEW = NO +GENERATE_TREEVIEW = YES # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. @@ -1451,7 +1524,7 @@ EXT_LINKS_IN_WINDOW = NO FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # @@ -1463,7 +1536,7 @@ FORMULA_FONTSIZE = 10 FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering +# https://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want to formulas look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path @@ -1490,8 +1563,8 @@ MATHJAX_FORMAT = NativeMML # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/. # This tag requires that the tag USE_MATHJAX is set to YES. 
MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest @@ -1501,7 +1574,8 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +MATHJAX_EXTENSIONS = TeX/AMSmath \ + TeX/AMSsymbols # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1552,7 +1626,7 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). +# Xapian (see: https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1565,7 +1639,7 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). See the section "External Indexing and +# Xapian (see: https://xapian.org/). See the section "External Indexing and # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. @@ -1617,21 +1691,35 @@ LATEX_OUTPUT = ops/latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. # -# Note that when enabling USE_PDFLATEX this option is only used for generating -# bitmaps for formulas in the HTML output, but not in the Makefile that is -# written to the output directory. -# The default file is: latex. +# Note that when not enabling USE_PDFLATEX the default is latex when enabling +# USE_PDFLATEX the default is pdflatex and when in the later case latex is +# chosen this is overwritten by pdflatex. 
For specific output languages the +# default can have been set differently, this depends on the implementation of +# the output language. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate # index for LaTeX. +# Note: This tag is used in the Makefile / make.bat. +# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file +# (.tex). # The default file is: makeindex. # This tag requires that the tag GENERATE_LATEX is set to YES. MAKEINDEX_CMD_NAME = makeindex +# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to +# generate index for LaTeX. In case there is no backslash (\) as first character +# it will be automatically added in the LaTeX code. +# Note: This tag is used in the generated output file (.tex). +# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat. +# The default value is: makeindex. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_MAKEINDEX_CMD = makeindex + # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX # documents. This may be useful for small projects and may help to save some # trees in general. @@ -1752,7 +1840,7 @@ LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See -# http://en.wikipedia.org/wiki/BibTeX and \cite for more info. +# https://en.wikipedia.org/wiki/BibTeX and \cite for more info. # The default value is: plain. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1766,6 +1854,14 @@ LATEX_BIB_STYLE = plain LATEX_TIMESTAMP = NO +# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) +# path from which the emoji images will be read. If a relative path is entered, +# it will be relative to the LATEX_OUTPUT directory. If left blank the +# LATEX_OUTPUT directory will be used. 
+# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_EMOJI_DIRECTORY = + #--------------------------------------------------------------------------- # Configuration options related to the RTF output #--------------------------------------------------------------------------- @@ -1805,9 +1901,9 @@ COMPACT_RTF = NO RTF_HYPERLINKS = NO -# Load stylesheet definitions from file. Syntax is similar to doxygen's config -# file, i.e. a series of assignments. You only have to provide replacements, -# missing definitions are set to their default value. +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# configuration file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. # # See also section "Doxygen usage" for information on how to generate the # default style sheet that doxygen normally uses. @@ -1816,8 +1912,8 @@ RTF_HYPERLINKS = NO RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is -# similar to doxygen's config file. A template extensions file can be generated -# using doxygen -e rtf extensionFile. +# similar to doxygen's configuration file. A template extensions file can be +# generated using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. RTF_EXTENSIONS_FILE = @@ -1903,6 +1999,13 @@ XML_OUTPUT = xml XML_PROGRAMLISTING = YES +# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include +# namespace members in file scope as well, matching the HTML output. +# The default value is: NO. +# This tag requires that the tag GENERATE_XML is set to YES. 
+ +XML_NS_MEMB_FILE_SCOPE = NO + #--------------------------------------------------------------------------- # Configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- @@ -1935,9 +2038,9 @@ DOCBOOK_PROGRAMLISTING = NO #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sf.net) file that captures the -# structure of the code including all documentation. Note that this feature is -# still experimental and incomplete at the moment. +# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# the structure of the code including all documentation. Note that this feature +# is still experimental and incomplete at the moment. # The default value is: NO. GENERATE_AUTOGEN_DEF = NO @@ -2037,7 +2140,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS +PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2104,12 +2207,6 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of 'which perl'). -# The default file (with absolute path) is: /usr/bin/perl. - -PERL_PATH = /usr/bin/perl - #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- @@ -2123,15 +2220,6 @@ PERL_PATH = /usr/bin/perl CLASS_DIAGRAMS = YES -# You can define message sequence charts within doxygen comments using the \msc -# command. 
Doxygen will then run the mscgen tool (see: -# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. @@ -2150,7 +2238,7 @@ HIDE_UNDOC_RELATIONS = YES # http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO -# The default value is: YES. +# The default value is: NO. HAVE_DOT = YES @@ -2306,9 +2394,7 @@ DIRECTORY_GRAPH = YES # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). -# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd, -# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo, -# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo, +# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo, # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and # png:gdiplus:gdiplus. # The default value is: png. @@ -2361,6 +2447,11 @@ DIAFILE_DIRS = PLANTUML_JAR_PATH = +# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a +# configuration file for plantuml. + +PLANTUML_CFG_FILE = + # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. 
From c77d69964330d2687f7e52d02669f35aab778466 Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Wed, 3 Nov 2021 12:34:32 +0000 Subject: [PATCH 318/324] C++ API instance block --- doc/opsapi.md | 95 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 17 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 3562a11453..ec79b0b90f 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -43,11 +43,15 @@ To further clarify some of the important issues encountered when designing the O OPS handle all of these different requirements through stencil definitions. -## C API +## OPS C and C++ API -### Initialisation and termination routines +Both C and C++ styles API are provided for utilizing the capabilities provided by the OPS library. They are essentially the same although there are minor differences in syntax. The C++ API is mainly designed for data abstraction, which therefore provides better data encapsulation and the support of multiple instances and threading (OpenMP currently). In the following both C style routines and C++ class and methods will be introduced according their functionality with a notice (C) or (C++). If there is no such notice, the routine will apply to both. + +To enable the C++ API, a compiler directive ``OPS_CPP_API`` is required. -#### ops_init +### Initialisation and termination routines +#### C Style +##### ops_init __void ops_init(int argc, char** argv, int diags_level)__ @@ -77,10 +81,26 @@ MPI receives depth (for OPS internal development only) __void ops_exit()__ This routine must be called last to cleanly terminate the OPS computation. +#### C++ style + +With the C++ style APIs, all data structures (block, data and stencils etc ) are encapsulated into a class ``OPS_instance``. 
Thus, we can allocate multiple instances of ``OPS_instance`` by using the class constructor, for example, + +```c++ +// Allocate an instance +OPS_instance *instance = new OPS_instance(argc,argv,1,ss); +``` + +where the meaning of arguments are same to the C API, while the extra argument (i.e., ss) is for accpeting the messages. + +An explicit termination is not needed for the C++ API, although we need to "delete" the instance in if it is allocated through pointer, i.e., +```C++ +delete instance; +``` ### Declaration routines -#### ops_decl_block +#### Block +##### ops_decl_block (C) __ops_block ops_decl_block(int dims, char *name)__ @@ -90,7 +110,15 @@ This routine defines a structured grid block. | dims | dimension of the block | | name | a name used for output diagnostics | -#### ops_decl_block_hdf5 +##### OPS_instance::decl_block (C++) + +A method of the OPS_instance class for declaring a block, which accepts same arguments with the C style function. A OPS_instance object should be constructed before this. The method returns a pointer to a ops_block type variable, where ops_block is an alias to a pointer type of ops_block_core. An example is + +```C++ +ops_block grid2D = instance->decl_block(2, "grid2D"); +``` + +##### ops_decl_block_hdf5 (C) __ops_block ops_decl_block_hdf5(int dims, char *name, char *file)__ @@ -108,7 +136,8 @@ arguments, it is included here for error checking (e.g. check if blocks defined in an HDF5 file is matching with the declared arguments in an application) and completeness. -#### ops_decl_dat +#### Dat (ops_cat_core) +##### ops_decl_dat (C) __ops_dat ops_decl_dat(ops block block, int dim, int *size, int *base, int *dm, int *d p, T *data, char *type, char *name)__ @@ -131,7 +160,16 @@ The `size` allows to declare different sized data arrays on a given indicate the offset from the edge of a block (in both the negative and positive directions of each dimension). 
-#### ops_decl_dat_hdf5 +##### ops_block_core::decl_dat (C++) +The method ops_block_core::decl_dat is used to define a ops_dat object, which accepts almost same arguments with the C conterpart where the block argument is not necessary, e.g., +```C++ +//declare ops_dat with dim = 2 +ops_dat dat0 = grid2D->decl_dat(2, size, base, d_m, d_p, temp, "double", "dat0"); +ops_dat dat1 = grid2D->decl_dat(2, size, base, d_m, d_p, temp, "double", "dat1"); +``` +where grid2D is a ops_block_core object which shall be defined before this. + +##### ops_decl_dat_hdf5 (C) __ops_dat ops_decl_dat_hdf5(ops_block block, int dim, char *type, char *name, char *file)__ @@ -145,12 +183,13 @@ type | the name of type used for output diagnostics (e.g. ``double``,``float |name | name of the dat used for output diagnostics| |file | hdf5 file to read and obtain the data from| -#### ops_decl_const +#### Global constant +##### ops_decl_const (C) __void ops_decl_const(char const * name, int dim, char const * type, T * data )__ This routine defines a global constant: a variable in global scope. Global constants need to be declared upfront - so that they can be correctly handled for different parallelizations. For e.g CUDA on GPUs. Once defined + so that they can be correctly handled for different parallelization. For e.g CUDA on GPUs. Once defined they remain unchanged throughout the program, unless changed by a call to ops_update_const(..). The ``name'' and``type'' parameters **must** be string literals since they are used in the code generation step @@ -161,7 +200,12 @@ This routine defines a global constant: a variable in global scope. Global const |type | the name of type used for output diagnostics (e.g. ``double``, ``float``) | |data | pointer to input data of type *T* | -#### ops_decl_halo +##### OPS_instance::decl_const (C++) + +The method accepts same arguments with its C counterpart. 
+ +#### Halo definition +##### ops_decl_halo (C) __ops_halo ops_decl_halo(ops_dat from, ops_dat to, int *iter_size, int* from_base, int *to_base, int *from_dir, int *to_dir)__ @@ -183,7 +227,10 @@ iter_size = \[2,100,100\], from_base = \[1,0,0\], to_base = \[-1,0,0\], from_dir = \[0,1,2\], to_dir = \[0,1,2\]. In more complex case this allows for transfers between blocks with different orientations.) -#### ops_decl_halo_hdf5 +##### OPS_instance::decl_halo (C++) +The method accepts same arguments with its C counterpart. + +##### ops_decl_halo_hdf5 (C) __ops_halo ops_decl_halo_hdf5(ops_dat from, ops_dat to, char* file)__ @@ -195,7 +242,7 @@ This routine reads in a halo relationship between two datasets defined on two di |to| destination dataset| |file| hdf5 file to read and obtain the data from| -#### ops_decl_halo_group +##### ops_decl_halo_group (C) __ops_halo_group ops_decl_halo_group(int nhalos, ops_halo *halos)__ @@ -205,7 +252,12 @@ This routine defines a collection of halos. Semantically, when an exchange is tr |nhalos| number of halos in *halos* | |halos| array of halos| -#### ops_decl_reduction_handle +##### OPS_instance::decl_halo_group (C++) + +The method accepts same arguments with its C counterpart. + +#### Reduction handle +##### ops_decl_reduction_handle (C) __ops_reduction ops_decl_reduction_handle(int size, char *type, char *name)__ This routine defines a reduction handle to be used in a parallel loop @@ -222,7 +274,10 @@ __{void ops_reduction_result(ops_reduction handle, T *result) |handle| the *ops_reduction* handle | |result| a pointer to write the results to, memory size has to match the declared | -#### ops_partition +##### OPS_instance::decl_reduction_handle (C++) +The method accepts same arguments with its C counterpart. 
+#### Partition +##### ops_partition (C) __ops_partition(char *method)__ @@ -233,15 +288,21 @@ and ops_halo ops_decl_dat statements have been declared | ----------- | ----------- | |method| string describing the partitioning method. Currently this string is not used internally, but is simply a place-holder to indicate different partitioning methods in the future. | + +##### OPS_instance::partition (C++) + +The method accepts same arguments with its C counterpart. ### Diagnostic and output routines -#### ops_diagnostic_output +#### ops_diagnostic_output (C) __void ops_diagnostic_output()__ This routine prints out various useful bits of diagnostic info about sets, mappings and datasets. Usually used right after an ops_partition() call to print out the details of the decomposition +#### OPS_instance::diagnostic_output (C++) +Same to the C counterpart. #### ops_printf __void ops_printf(const char * format, ...)__ @@ -329,7 +390,7 @@ is found, prints an error message and exits. ### Halo exchange -#### ops_halo_transfer +#### ops_halo_transfer (C) __void ops_halo_transfer(ops_halo_group group)__ @@ -685,7 +746,7 @@ This routine copies the data given by the user to the internal data structure u ## Runtime Flags and Options -The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. +The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. 
### General flags * `OPS_DIAGS=` * `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` From 6a6b9700850c2d7e43e9998798062aef9fc673dc Mon Sep 17 00:00:00 2001 From: Toby Flynn Date: Mon, 1 Nov 2021 17:50:02 +0000 Subject: [PATCH 319/324] OPS-Tridsolver docs --- doc/opsapi.md | 54 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index ec79b0b90f..00551b64ed 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -724,6 +724,7 @@ A single call to ops_dat_release_raw_data() releases all pointers obtained by pr #### ops_dat_fetch_data __void ops_dat_fetch_data(ops_dat dat, int part, int *data)__ + This routine copies the data held by OPS to the user-specified memory location, which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents. | Arguments | Description | @@ -744,6 +745,53 @@ This routine copies the data given by the user to the internal data structure u |part| the chunk index (has to be 0)| |data| pointer to memory which should be copied to OPS | +### Tridsolver Calls +This section specifies APIs that allow [Tridsolver](https://github.com/OP-DSL/tridsolver) (a tridiagonal solver library) to be called from OPS. The library can be used to solve a large number of tridiagonal systems of equations stored in multidimensional datasets. Parameters that are passed to Tridsolver from OPS are stored in an `ops_tridsolver_params` object. The constructor for this class takes the `ops_block` that the datasets are defined over as an argument and optionally also a solving strategy to use (only relevant to MPI applications). 
The following solving strategies are available (see Tridsolver for more details about these): + +- GATHER_SCATTER (not available for GPUs) +- ALLGATHER +- LATENCY_HIDING_TWO_STEP +- LATENCY_HIDING_INTERLEAVED +- JACOBI +- PCR (default) + +Then parameters specific to different solving strategies can be set using setter methods. For applications using MPI, it is beneficial to reuse `ops_tridsolver_params` objects between solves as much as possible due to set up times involved with creating Tridsolver's MPI communicators. + +#### ops_tridMultiDimBatch + +__void ops_tridMultiDimBatch(int ndim, int solvedim, int* dims, ops_dat a, ops_dat b, ops_dat c, ops_dat d, ops_tridsolver_params *tridsolver_ctx)__ + +This solves multiple tridiagonal systems of equations in multidimensional datasets along the specified dimension. The matrix is stored in the `a` (bottom diagonal), `b` (central diagonal) and `c` (top diagonal) datasets. The right hand side is stored in the `d` dataset and the result is also written to this dataset. + +| Arguments | Description | +| ----------- | ----------- | +|ndim| the dimension of the datasets | +|solvedim| the dimension to solve along | +|dims| the size of each dimension (excluding any padding) | +|a| the dataset for the lower diagonal | +|b| the dataset for the central diagonal | +|c| the dataset for the upper diagonal | +|d| the dataset for the right hand side, also where the solution is written to | +|tridsolver_ctx| an object containing the parameters for the Tridsolver library | + +#### ops_tridMultiDimBatch_Inc + +__void ops_tridMultiDimBatch(int ndim, int solvedim, int* dims, ops_dat a, ops_dat b, ops_dat c, ops_dat d, ops_dat u, ops_tridsolver_params *tridsolver_ctx)__ + +This solves multiple tridiagonal systems of equations in multidimensional datasets along the specified dimension. The matrix is stored in the `a` (bottom diagonal), `b` (central diagonal) and `c` (top diagonal) datasets. 
The right hand side is stored in the `d` dataset and the result is added to the `u` dataset. + +| Arguments | Description | +| ----------- | ----------- | +|ndim| the dimension of the datasets | +|solvedim| the dimension to solve along | +|dims| the size of each dimension (excluding any padding) | +|a| the dataset for the lower diagonal | +|b| the dataset for the central diagonal | +|c| the dataset for the upper diagonal | +|d| the dataset for the right hand side | +|u| the dataset that the soluion is added to | +|tridsolver_ctx| an object containing the parameters for the Tridsolver library | + ## Runtime Flags and Options The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. @@ -755,11 +803,5 @@ The following is a list of all the runtime flags and options that can be used wh * `OPS_TILING` * `OPS_TILING_MAXDEPTH=` -### Tridsolver API flags -* `-halo 1` -* `-m` -* `-bx`, `-by` and `-bz` - - ## Doxygen Doxygen generated from OPS source can be found [here](https://op-dsl-ci.gitlab.io/ops-ci/). From 5bd02d47ca8a286ce0c76b88d989641781e0b88e Mon Sep 17 00:00:00 2001 From: Jianping Meng Date: Thu, 4 Nov 2021 09:52:18 +0000 Subject: [PATCH 320/324] C++ API Data Access --- doc/opsapi.md | 98 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 00551b64ed..4ab720fceb 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -45,7 +45,7 @@ OPS handle all of these different requirements through stencil definitions. ## OPS C and C++ API -Both C and C++ styles API are provided for utilizing the capabilities provided by the OPS library. They are essentially the same although there are minor differences in syntax. The C++ API is mainly designed for data abstraction, which therefore provides better data encapsulation and the support of multiple instances and threading (OpenMP currently). 
In the following both C style routines and C++ class and methods will be introduced according their functionality with a notice (C) or (C++). If there is no such notice, the routine will apply to both. +Both C and C++ styles API are provided for utilizing the capabilities provided by the OPS library. They are essentially the same although there are minor differences in syntax. The C++ API is mainly designed for data abstraction, which therefore provides better data encapsulation and the support of multiple instances and threading (OpenMP currently). In the following both C style routines and C++ class and methods will be introduced according to their functionality with a notice (C) or (C++). If there is no such notice, the routine either applies to both or might not provided by the C++ API. To enable the C++ API, a compiler directive ``OPS_CPP_API`` is required. @@ -474,7 +474,7 @@ used to generate initial geometry. The final ingredient is the stencil specification, for which we have two versions: simple and strided. -#### ops_decl_stencil +#### ops_decl_stencil (C) __ops_stencil ops_decl_stencil(int dims,int points, int *stencil, char *name)__ @@ -485,7 +485,10 @@ __ops_stencil ops_decl_stencil(int dims,int points, int *stencil, char *name)__ |stencil| stencil for accessing data| |name| string representing the name of the stencil| -#### ops_decl_strided_stencil +#### OPS_instance::decl_stencil (C++) + +The method accepts same arguments with its C counterpart. +#### ops_decl_strided_stencil (C) __ops_stencil ops_decl_strided_stencil(int dims, int points, int *stencil, int *stride, char *name)__ @@ -497,6 +500,10 @@ __ops_stencil ops_decl_strided_stencil(int dims, int points, int *stencil, int * |stride| stride for accessing data| |name| string representing the name of the stencil| +#### OPS_instance::decl_strided_stencil (C++) + +The method accepts same arguments with its C counterpart. 
+ #### ops_decl_stencil_hdf5 __ops_stencil ops_decl_stencil_hdf5(int dims,int points, char *name, char* file)__ @@ -537,10 +544,7 @@ dimension applications (with a stride of 0 for the relevant dimensions). ### Checkpointing -OPS supports the automatic checkpointing of applications. Using the API below, the user specifies the file name for the -checkpoint and an average time interval between checkpoints, OPS will then automatically save all necessary information -periodically that is required to fast-forward to the last checkpoint if a crash occurred. Currently, when re-launching -after a crash, the same number of MPI processes have to be used. To enable checkpointing mode, the *OPS_CHECKPOINT* runtime argument has to be used. +OPS supports the automatic checkpointing of applications. Using the API below, the user specifies the file name for the checkpoint and an average time interval between checkpoints, OPS will then automatically save all necessary information periodically that is required to fast-forward to the last checkpoint if a crash occurred. Currently, when re-launching after a crash, the same number of MPI processes have to be used. To enable checkpointing mode, the *OPS_CHECKPOINT* runtime argument has to be used. (**Do we also need to define the CHECKPOINTING compiler directive?**) #### ops_checkpointing_init @@ -559,8 +563,7 @@ mode, false otherwise. * OPS_CHECKPOINT_MANUAL_DATLIST - Indicates that the user manually controls the location of the checkpoint, and explicitly specifies the list of *ops_dat*s to be saved. 
-* OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the location of the checkpoint, and it also enables fast-forwarding, by skipping the execution of the -application (even though none of the parallel loops would actually execute, there may be significant work outside of those) up to the checkpoint +* OPS_CHECKPOINT_FASTFW - Indicates that the user manually controls the location of the checkpoint, and it also enables fast-forwarding, by skipping the execution of the application (even though none of the parallel loops would actually execute, there may be significant work outside of those) up to the checkpoint * OPS_CHECKPOINT_MANUAL - Indicates that when the corresponding API function is called, the checkpoint should be created. Assumes the presence of the above two options as well. @@ -570,8 +573,7 @@ __void ops_checkpointing_manual_datlist(int ndats, ops_dat *datlist)__ A user can call this routine at a point in the code to mark the location of a checkpoint. At this point, the list of datasets specified will be saved. The validity of what is saved is not checked by the checkpointing algorithm assuming that the user knows -what data sets to be saved for full recovery. This routine should be called frequently (compared to check-pointing -frequency) and it will trigger the creation of the checkpoint the first time it is called after the timeout occurs. +what data sets to be saved for full recovery. This routine should be called frequently (compared to check-pointing frequency) and it will trigger the creation of the checkpoint the first time it is called after the timeout occurs. | Arguments | Description | | ----------- | ----------- | @@ -630,7 +632,7 @@ in a tech-report on checkpointing, to be published later. This section describes APIs that give the user access to internal data structures in OPS and return data to user-space. 
These should be used cautiously and sparsely, as they can affect performance significantly -#### ops_dat_get_local_npartitions +#### ops_dat_get_local_npartitions (C) __int ops_dat_get_local_npartitions(ops_dat dat)__ @@ -640,7 +642,9 @@ This routine returns the number of chunks of the given dataset held by the curre | ----------- | ----------- | |dat| the dataset| -#### ops_dat_get_global_npartitions} +#### ops_dat_core::get_local_npartitions (C++) +The C++ version of ``ops_dat_get_local_npartitions``, which does not require input. +#### ops_dat_get_global_npartitions (C) __int ops_dat_get_global_npartitions(ops_dat dat)__ @@ -650,7 +654,9 @@ This routine returns the number of chunks of the given dataset held by all proce | ----------- | ----------- | |dat| the dataset -#### ops_dat_get_extents +#### ops_dat_core::get_global_npartitions (C++) +The C++ version of ``ops_dat_get_global_npartitions``, which does not require input. +#### ops_dat_get_extents (C) __void ops_dat_get_extents(ops_dat dat, int part, int *disp, int *sizes)__ @@ -663,7 +669,10 @@ This routine returns the MPI displacement and size of a given chunk of the given |disp| an array populated with the displacement of the chunk within the ``global'' distributed array| |sizes| an array populated with the spatial extents| -#### ops_dat_get_raw_metadata +#### ops_dat_core::get_extents (C++) +The C++ version of ``ops_dat_get_extents`` where the arguments are the same except no need of the ops_dat arguments. + +#### ops_dat_get_raw_metadata (C) __char* ops_dat_get_raw_metadata(ops_dat dat, int part, int *disp, int *size, int *stride, int *d_m, int *d_p)__ @@ -679,7 +688,9 @@ This routine returns array shape metadata corresponding to the ops_dat. Any of t |d_m| an array populated with padding on the left in each dimension. 
Note that these are negative values| |d_p| an array populated with padding on the right in each dimension| -#### ops_dat_get_raw_pointer +#### ops_dat_core::get_raw_metadata (C++) +The C++ version of ``ops_dat_get_raw_metadata`` where the arguments are the same except no need of the ops_dat arguments. +#### ops_dat_get_raw_pointer (C) __char* ops_dat_get_raw_pointer(ops_dat dat, int part, ops_stencil stencil, ops_memspace *memspace)__ @@ -692,7 +703,9 @@ This routine returns a pointer to the internally stored data, with MPI halo regi |stencil| a stencil used to determine required MPI halo exchange depths| |memspace| when set to OPS_HOST or OPS_DEVICE, returns a pointer to data in that memory space, otherwise must be set to 0, and returns whether data is in the host or on the device| -#### ops_dat_release_raw_data +#### ops_dat_core::get_raw__pointer (C++) +The C++ version of ``ops_dat_get_raw_pointer`` where the arguments are the same except no need of the ops_dat arguments. +#### ops_dat_release_raw_data (C) __void ops_dat_release_raw_data(ops_dat dat, int part, ops_access acc)__ @@ -706,34 +719,35 @@ A single call to ops_dat_release_raw_data() releases all pointers obtained by pr |part| the chunk index (has to be 0)| |acc| the kind of access that was used by the user (OPS_READ if it was read only, OPS_WRITE if it was overwritten, OPS_RW if it was read and written)| -#### ops_dat_release_raw_data +#### ops_dat_core::_release_raw_data (C++) +The C++ version of ``ops_dat_release_raw_data`` where the arguments are the same except no need of the ops_dat arguments. +#### ops_dat_fetch_data (C) -__void ops_dat_release_raw_data_memspace(ops_dat dat, int part, ops_access acc, ops_memspace *memspace)__ - -Indicates to OPS that a dataset previously accessed with ops_dat_get_raw_pointer is released by the user, and also tells OPS how it was accessed, and which memory space was used. 
+__void ops_dat_fetch_data(ops_dat dat, int part, int *data)__ -A single call to ops_dat_release_raw_data() releases all pointers obtained by previous calls to ops_dat_get_raw_pointer() calls on the same dat and with the same *memspace argument, i.e. calls do not nest. +This routine copies the data held by OPS to the user-specified memory location, which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents. | Arguments | Description | | ----------- | ----------- | |dat| the dataset| -|part| the chunk index (has to be 0)| -|acc| the kind of access that was used by the user (OPS_READ if it was read only, OPS_WRITE if it was overwritten, OPS_RW if it was read and written)| -|memspace| set to OPS_HOST or OPS_DEVICE | +|part| the chunk index (has to be 0) | +|data| pointer to memory which should be filled by OPS| -#### ops_dat_fetch_data +#### ops_dat_fetch_data_memspace (C) -__void ops_dat_fetch_data(ops_dat dat, int part, int *data)__ +__void ops_dat_fetch_data_memspace(ops_dat dat, int part, char *data, ops_memspace memspace)__ -This routine copies the data held by OPS to the user-specified memory location, which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents. +This routine copies the data held by OPS to the user-specified memory location, as which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents. | Arguments | Description | | ----------- | ----------- | |dat| the dataset| |part| the chunk index (has to be 0) | |data| pointer to memory which should be filled by OPS| - -#### ops_dat_set_data +| memspace |the memory space where the data pointer is| +#### ops_dat_core::fetch_data (C++) +The C++ version of ``ops_dat_fetch_data_memspace`` where the arguments the same except no need of the ops_dat arguments. 
+#### ops_dat_set_data (C) __void ops_dat_set_data(ops_dat dat, int part, int *data)__ @@ -745,7 +759,25 @@ This routine copies the data given by the user to the internal data structure u |part| the chunk index (has to be 0)| |data| pointer to memory which should be copied to OPS | -### Tridsolver Calls + +#### ops_dat_set_data_memspace (C) + +__void ops_dat_set_data_memspace(ops_dat dat, int part, char *data, ops_memspace memspace)__ + +This routine copies the data given by the user to the internal data structure used by OPS. User data needs to be laid out in column-major order and strided as indicated by the sizes parameter of ops_dat_get_extents. + +| Arguments | Description | +| ----------- | ----------- | +|dat| the dataset| +|part| the chunk index (has to be 0)| +|data| pointer to memory which should be copied to OPS | +|memspace| the memory space where the data pointer is| + +#### ops_dat_core::set_data (C++) +The C++ version of ``ops_dat_set_data_memspace`` where the arguments the same except no need of the ops_dat arguments. +### Linear algebra solvers + +#### Tridiagonal solver This section specifies APIs that allow [Tridsolver](https://github.com/OP-DSL/tridsolver) (a tridiagonal solver library) to be called from OPS. The library can be used to solve a large number of tridiagonal systems of equations stored in multidimensional datasets. Parameters that are passed to Tridsolver from OPS are stored in an `ops_tridsolver_params` object. The constructor for this class takes the `ops_block` that the datasets are defined over as an argument and optionally also a solving strategy to use (only relevant to MPI applications). The following solving strategies are available (see Tridsolver for more details about these): - GATHER_SCATTER (not available for GPUs) @@ -757,7 +789,7 @@ This section specifies APIs that allow [Tridsolver](https://github.com/OP-DSL/tr Then parameters specific to different solving strategies can be set using setter methods. 
For applications using MPI, it is beneficial to reuse `ops_tridsolver_params` objects between solves as much as possible due to set up times involved with creating Tridsolver's MPI communicators. -#### ops_tridMultiDimBatch +##### ops_tridMultiDimBatch __void ops_tridMultiDimBatch(int ndim, int solvedim, int* dims, ops_dat a, ops_dat b, ops_dat c, ops_dat d, ops_tridsolver_params *tridsolver_ctx)__ @@ -774,7 +806,7 @@ This solves multiple tridiagonal systems of equations in multidimensional datase |d| the dataset for the right hand side, also where the solution is written to | |tridsolver_ctx| an object containing the parameters for the Tridsolver library | -#### ops_tridMultiDimBatch_Inc +##### ops_tridMultiDimBatch_Inc __void ops_tridMultiDimBatch(int ndim, int solvedim, int* dims, ops_dat a, ops_dat b, ops_dat c, ops_dat d, ops_dat u, ops_tridsolver_params *tridsolver_ctx)__ From 2b0288321a2f5d29169430ccb4e1e8f687736287 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 5 Nov 2021 08:08:41 +0000 Subject: [PATCH 321/324] Update opsapi.md --- doc/opsapi.md | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index 4ab720fceb..e56d336ab9 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -828,12 +828,24 @@ This solves multiple tridiagonal systems of equations in multidimensional datase The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. ### General flags -* `OPS_DIAGS=` -* `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` -* `-gpudirect` -* `OPS_CL_DEVICE=` -* `OPS_TILING` -* `OPS_TILING_MAXDEPTH=` +* `OPS_DIAGS=` : set OPS diagnostics level at runtime. + + `OPS_DIAGS=1` - no diagnostics, default level to achieve the best runtime performance. + + `OPS_DIAGS>1` - print block decomposition and `ops_par_loop` timing breakdown. 
+ + `OPS_DIAGS>4` - print intra-block halo buffer allocation feedback (for OPS internal development only). + + `OPS_DIAGS>5` - check if intra-block halo MPI sends depth match MPI receives depth (for OPS internal development only). + +* `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Y=` : The CUDA (and OpenCL) thread block sizes in X, Y and Z dimensions. The sizes should be an integer between 1 - 1024, and currently they should be selected such that `OPS_BLOCK_SIZE_X`*`OPS_BLOCK_SIZE_Y`*`OPS_BLOCK_SIZE_Z`< 1024 + +* `-gpudirect` : Enable GPU direct support when executing MPI+CUDA executables. + +* `OPS_CL_DEVICE=` : Select the OpenCL device for execution. Usually `OPS_CL_DEVICE=0` selects the CPU and `OPS_CL_DEVICE=1` selects GPUs. The selected device will be reported by OPS during execution. + +* `OPS_TILING` : Execute OpenMP code with cache blocking tiling. See the [Performance Tuning](https://github.com/OP-DSL/OPS/blob/MarkdownDocDev/doc/perf.md) section. +* `OPS_TILING_MAXDEPTH=` : Execute MPI+OpenMP code with cache blocking tiling and further communication avoidance. See the [Performance Tuning](https://github.com/OP-DSL/OPS/blob/MarkdownDocDev/doc/perf.md) section. ## Doxygen Doxygen generated from OPS source can be found [here](https://op-dsl-ci.gitlab.io/ops-ci/). From c88a3e3bf9c9a4a9d943afb5060692f09f776ecc Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Fri, 5 Nov 2021 08:09:34 +0000 Subject: [PATCH 322/324] Update opsapi.md --- doc/opsapi.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/opsapi.md b/doc/opsapi.md index e56d336ab9..1562380e12 100644 --- a/doc/opsapi.md +++ b/doc/opsapi.md @@ -827,7 +827,7 @@ This solves multiple tridiagonal systems of equations in multidimensional datase ## Runtime Flags and Options The following is a list of all the runtime flags and options that can be used when executing OPS generated applications. -### General flags + * `OPS_DIAGS=` : set OPS diagnostics level at runtime. 
`OPS_DIAGS=1` - no diagnostics, default level to achieve the best runtime performance. From 663ee58be84f8332f632f40662feff5709e716c9 Mon Sep 17 00:00:00 2001 From: "Gihan R. Mudalige" Date: Mon, 8 Nov 2021 21:56:35 +0000 Subject: [PATCH 323/324] Update README --- doc/README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/README b/doc/README index e941abeaa5..5dff700d96 100644 --- a/doc/README +++ b/doc/README @@ -1,2 +1,2 @@ Latest documentation can be found on: -https://op-dsl.github.io/ +https://ops-dsl.readthedocs.io/en/latest/ From cf486c23c9f30947baf3ef97ea00d993ade02e6e Mon Sep 17 00:00:00 2001 From: Istvan Reguly Date: Mon, 8 Nov 2021 21:59:14 +0100 Subject: [PATCH 324/324] Python3 converion of codegen --- ops_translator/c/ops.py | 4 ++-- ops_translator/c/ops_gen_mpi_cuda.py | 5 +++-- ops_translator/c/ops_gen_mpi_hip.py | 3 ++- ops_translator/c/ops_gen_mpi_inline.py | 5 +++-- ops_translator/c/ops_gen_mpi_lazy.py | 3 ++- ops_translator/c/ops_gen_mpi_openacc.py | 3 ++- ops_translator/c/ops_gen_mpi_opencl.py | 5 +++-- ops_translator/fortran/ops_fortran.py | 2 +- ops_translator/fortran/ops_fortran_gen_mpi.py | 3 ++- ops_translator/fortran/ops_fortran_gen_mpi_cuda.py | 3 ++- ops_translator/fortran/ops_fortran_gen_mpi_openacc.py | 3 ++- ops_translator/fortran/ops_fortran_gen_mpi_openmp.py | 3 ++- 12 files changed, 26 insertions(+), 16 deletions(-) diff --git a/ops_translator/c/ops.py b/ops_translator/c/ops.py index e2a9ac0b39..d010286ab5 100755 --- a/ops_translator/c/ops.py +++ b/ops_translator/c/ops.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2.7 +#!/usr/bin/env python3 # Open source copyright declaration based on BSD open source template: # http://www.opensource.org/licenses/bsd-license.php @@ -643,7 +643,7 @@ def main(source_files): repeat = True which_file = nk else: - print('repeated kernel with incompatible arguments: ERROR' + kernels[nk]['name']) + print(('repeated kernel with incompatible arguments: ERROR' + kernels[nk]['name'])) break 
diff --git a/ops_translator/c/ops_gen_mpi_cuda.py b/ops_translator/c/ops_gen_mpi_cuda.py index c3807981cf..9e0d9e3567 100644 --- a/ops_translator/c/ops_gen_mpi_cuda.py +++ b/ops_translator/c/ops_gen_mpi_cuda.py @@ -49,6 +49,7 @@ import re import datetime +import errno import os import glob @@ -97,7 +98,7 @@ def ops_gen_mpi_cuda(master, date, consts, kernels, soa_set): try: os.makedirs('./CUDA') except OSError as e: - if e.errno != os.errno.EEXIST: + if e.errno != errno.EEXIST: raise for nk in range (0,len(kernels)): arg_typ = kernels[nk]['arg_type'] @@ -694,7 +695,7 @@ def ops_gen_mpi_cuda(master, date, consts, kernels, soa_set): code('') for n in range (0, nargs): - if arg_typ[n] == 'ops_arg_gbl' and accs[n] <> OPS_READ: + if arg_typ[n] == 'ops_arg_gbl' and accs[n] != OPS_READ: code('arg'+str(n)+'.data = block->instance->OPS_reduct_h + reduct_bytes;') code('arg'+str(n)+'.data_d = block->instance->OPS_reduct_d + reduct_bytes;') code('for (int b=0; b OPS_READ: + if arg_typ[n] == 'ops_arg_gbl' and accs[n] != OPS_READ: code('arg'+str(n)+'.data = block->instance->OPS_reduct_h + reduct_bytes;') code('arg'+str(n)+'.data_d = block->instance->OPS_reduct_d + reduct_bytes;') code('for (int b=0; b