diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ecb22b0399..239f41b92d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -255,19 +255,33 @@ test:Intel:
   only:
     - master
 
-#Stage "docs"
-make-docs:
+# #Stage "docs"
+# make-docs:
+#   stage: docs
+#   when: always
+#   tags:
+#     - CCP, test
+#   script:
+#     - cd doc && make all clean
+#   artifacts:
+#     expire_in: 4 week
+#     paths:
+#       - doc/user.pdf
+#       - doc/ops/html
+#       - doc/ops/latex/refman.pdf
+#       - doc/ops_translator/html
+#       - doc/ops_translator/latex/refman.pdf
+
+pages:
   stage: docs
   when: always
   tags:
     - CCP, test
   script:
-    - cd doc && make all clean
+    - cd doc
+    - doxygen ops/Doxyfile
+    - mv ops/html/ $CI_PROJECT_DIR/public/
   artifacts:
-    expire_in: 4 week
     paths:
-      - doc/user.pdf
-      - doc/ops/html
-      - doc/ops/latex/refman.pdf
-      - doc/ops_translator/html
-      - doc/ops_translator/latex/refman.pdf
\ No newline at end of file
+      - public
+
diff --git a/.readthedocs.yml b/.readthedocs.yml
new file mode 100644
index 0000000000..ceca0e737d
--- /dev/null
+++ b/.readthedocs.yml
@@ -0,0 +1,13 @@
+# File: .readthedocs.yaml
+
+version: 2
+
+# Build from the docs/ directory with Sphinx
+sphinx:
+  configuration: doc/conf.py
+
+# Explicitly set the version of Python and its requirements
+python:
+  version: 3.8
+  install:
+    - requirements: doc/requirement.txt
\ No newline at end of file
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000000..2d43b90743
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,10 @@
+
+List of Authors
+
+Mike Giles
+Gihan Mudalige
+Istvan Reguly
+Daniel Balogh
+Toby Flynn
+Satya Jammy
+Jianping Meng
diff --git a/README b/README
deleted file mode 100644
index f7aa601f52..0000000000
--- a/README
+++ /dev/null
@@ -1,75 +0,0 @@
-OPS is an API with associated libraries and pre-processors to generate
-parallel executables for applications on mulit-block structured grids.
-
-This repository contains the implementation of the run-time library
-and the pre-processor, and is structured as follows:
-
-|
-`- ops: Implementation of the user and run-time OPS C/C++ APIs
-|
-`- apps: Application examples in C and Fortran
-|        These are examples of user application code and also include
-|        the target code an OPS pre-processor should produce to correctly
-|        use the OPS run-time library.
-|
-`- translator: Python OPS pre-processor for C/C++ API
-|
-`- doc: Documentation
-
-Installation
-============
-
-1. Set up environmental variables:
-
-   OPS_COMPILER - compiler to be used (Currently supports Intel, PGI and
-   Cray compilers, but others can be easily incorporated by extending the
-   Makefiles used in step 2 and 3)
-
-   OPS_INSTALL_PATH - Installation directory of OPS/ops
-
-   CUDA_INSTALL_PATH - Installation directory of CUDA,
-   usually /usr/local/cuda (to build CUDA libs and applications)
-
-   OPENCL_INSTALL_PATH - Installation directory of OpenCL,
-   usually /usr/local/cuda for NVIDIA OpenCL implementation
-   (to build OpenCL libs and applications)
-
-   MPI_INSTALL_PATH - Installation directory of MPI (to build MPI
-   based distributed memory libs and applications)
-
-   HDF5_INSTALL_PATH - Installation directory of HDF5
-   (to support HDF5 based File I/O)
-
-   See example scripts (e.g. source_intel, source_pgi_15.10, source_cray)
-   under OPS/ops/ that sets up the environment for building with various
-   compilers (Intel, PGI, Cray).
-
-2. Build OPS back-end libraries.
-
-   For C/C++ back-end use Makefile under OPS/ops/c (modify Makefile if required).
-   The libraries will be built in OPS/ops/c/lib
-
-   cd $OPS_INSTALL_PATH/c
-   make
-
-   For Fortran back-end use Makefile under OPS/ops/fortran
-   (modify Makefile if required). The libraries will be built in OPS/ops/fortran/lib
-
-   cd $OPS_INSTALL_PATH/fortran
-   make
-
-3. Build OPS example applications
-
-   For example to build CloverLeaf_3D under OPS/apps/c/CloverLeaf_3D
-
-   cd ../apps/c/Cloverleaf_3D/
-   make
-
-How to cite
-===========
-Istvan Z Reguly, G.R Mudalige, Mike B Giles. Loop Tiling in Large-Scale
-Stencil Codes at Run-time with OPS. (2017) IEEE Transactions on Parallel
-and Distributed Systems. (http://dx.doi.org/10.1109/TPDS.2017.2778161)
diff --git a/README.md b/README.md
index a75182274c..89ddd03dd3 100644
--- a/README.md
+++ b/README.md
@@ -1,117 +1,57 @@
-## OPS
+# OPS
 
-OPS is an API with associated libraries and pre-processors to generate
-parallel executables for applications on multi-block structured grids.
+OPS (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures. Multi-block structured meshes consist of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran.
 
-This repository contains the implementation of the run-time library
-and the pre-processor, and is structured as follows:
+[![Build Status](https://gitlab.com/op-dsl-ci/ops-ci/badges/master/pipeline.svg)](https://gitlab.com/op-dsl-ci/ops-ci)
+[![Documentation Status](https://readthedocs.org/projects/ops-dsl/badge/?version=latest)](https://ops-dsl.readthedocs.io/en/latest/?badge=latest)
 
-* ops: Implementation of the user and run-time OPS C/C++ APIs
+This repository contains the implementation of the back-end library and the code-generator, and is structured as follows:
 
-* apps: Application examples in C.
-  These are examples of user application code and also include
-  the target code an OPS pre-processor should produce to correctly
-  use the OPS run-time library.
-  Currently the main application developed with OPS is a single
-  block structured mesh application - Cloverleaf originally
-  developed at https://github.com/Warwick-PCAV/CloverLeaf
+* `ops`: Implementation of the user and run-time OPS C/C++ APIs
+* `apps`: Application examples in C.
+  These are examples of user application code and also include the target parallel code generated by the OPS code generator.
+* `ops_translator`: Python OPS code generator for C/C++ API
+* `scripts`: example scripts for setting environmental variables and testing applications
+* `cmake`: cmake installation files
+* `makefiles`: makefile based installation files
+* `doc`: Documentation
 
-* translator: Python OPS pre-processor for C/C++ API
+## Documentation
 
-* doc: Documentation
+OPS documentation can be viewed on [Read the Docs](https://ops-dsl.readthedocs.io/).
 
-#### Installation
+## Citing
+To cite OPS, please reference the following paper:
 
-**Note: The current CMakefile and relevant instructions are mainly tested on linux-based systems including Windows Subsystem for Linux**
+[I. Z. Reguly, G. R. Mudalige and M. B. Giles, Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS, in IEEE Transactions on Parallel and Distributed Systems, vol. 29, no. 4, pp. 873-886, 1 April 2018, doi: 10.1109/TPDS.2017.2778161.](https://ieeexplore.ieee.org/abstract/document/8121995)
 
-##### Dependencies
+```
+@ARTICLE{Reguly_et_al_2018,
+  author={Reguly, István Z. and Mudalige, Gihan R. and Giles, Michael B.},
+  journal={IEEE Transactions on Parallel and Distributed Systems},
+  title={Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS},
+  year={2018},
+  volume={29},
+  number={4},
+  pages={873-886},
+  doi={10.1109/TPDS.2017.2778161}}
+```
 
- * CMake
+## Support and Contact
+The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html).
 
-   CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script.
-   ```bash
-   version=3.19.0
-   wget https://github.com/Kitware/CMake/releases/download/v$version/cmake-$version-Linux-x86_64.sh
-   # Assume that CMake is going to be installed at /usr/local/cmake
-   cmake_dir=/usr/local/cmake
-   # sudo is not necessary for directories in user space.
-   sudo mkdir $cmake_dir
-   sudo sh ./cmake-$version-Linux-x86_64.sh --prefix=$cmake_dir --skip-license
-   sudo ln -s $cmake_dir/bin/cmake /usr/local/bin/cmake
-   ```
+## Contributing
 
- * Python2
+To contribute to OPS please use the following steps:
 
-   **Python2** is required by the OPS Python translator. The CMake build system will try to identify it automatically. However, the process can fail sometime (e.g., if there are both Python2 and Python3 installed). If this happens, the path to Python2 can be specified manually by using **-DPython2_EXECUTABLE** when invoking CMake
+1. Clone this repository (on your local system)
+2. Create a new branch in your cloned repository
+3. Make changes / contributions in your new branch
+4. Submit your changes by creating a Pull Request to the `develop` branch of the OPS repository
 
- * HDF5
+The contributions in the `develop` branch will be merged into the master branch as we create a new release.
 
-   [HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of IO functionalities. The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identify the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using -DHDF5_ROOT.
-
- * CUDA
-
-   The CMake build system will detect the tookit automatically. If the automatic process fails, the build system will compile the library without the CUDA support. please use -DCUDA_TOOLKIT_ROOT_DIR to manually specify the path.
-
-
-
-
-##### Build OPS back-end libraries example applications
-###### Build the library and example applications together
-
-   Create a build directory, and run CMake (version 3.18 or newer)
-   ```bash
-   mkdir build
-   cd build
-   # Please see below for CMake options
-   cmake ${PATH_TO_OPS} -DBUILD_OPS_APPS=ON -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL -DGPU_NUMBER=1
-   make # IEEE=1 this option is important for applications to get accurate results
-   make install # sudo is needed if a directory like /usr/local/ is chosen.
-   ```
-After installation, the library and the python translator can be found at the direcory specified by CMAKE_INSTALL_PREFIX, together with the executable files for applications at APP_INSTALL_DIR.
-
-###### Build the library and example applications separately
-
-In this mode, the library can be firstly built and installed as
-
-```bash
- mkdir build
- cd build
- # Please see below for CMake options
- cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL
- make # IEEE=1 this option is important for applications to get accurate results
- make install # sudo is needed if a system direction is chosen,
- ```
-then the application can be built as
-
-```bash
- mkdir appbuild
- cd appbuild
- # Please see below for CMake options
- cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1
- make # IEEE=1 this option is important for applications to get accurate results
- ```
-###### Tests
-
-A few tasks for testing codes can be run by
-```bash
- make test
- ```
-The current tests are mainly based on the applications.
-###### Options of interest to specify to `cmake` include:
-
- * `-DCMAKE_BUILD_TYPE=Release` - enable optimizations
 * `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only)
- * `-DOPS_TEST=ON` - enable the tests
- * `-DCMAKE_INSTALL_PREFIX=` - specify the installation direction for the library (/usr/local by default, Library CMake only)
- * `-DAPP_INSTALL_DIR=` - specify the installation direction for the applications ($HOME/OPS-APPS by default)
- * `-DGPU_NUMBER=` - specify the number of GPUs used in the tests
- * `-DOPS_INSTALL_DIR=` - specify where the OPS library is installed (Application CMake only, see [here](#build-the-library-and-example-applications-separately))
- * `-DOPS_VERBOSE_WARNING=ON` - show verbose output during building process
-
-
+## License
+OPS is released as an open-source project under the BSD 3-Clause License. See the file called [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) for more information.
diff --git a/doc/Makefile b/doc/Makefile
index 3a6807783a..7e68d6ad2c 100755
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -8,28 +8,14 @@
 
 .PHONY : all user doxygen clean distclean
 
-all : user doxygen
-
-user :
-	pdflatex --shell-escape user.tex
-	pdflatex --shell-escape user.tex
-	-bibtex user
-	pdflatex --shell-escape user.tex
-	latex_count=8 ; \
-	while egrep -s 'Rerun (LaTeX|to get cross-references right)' user.log && [ $$latex_count -gt 0 ] ;\
-	do \
-	  echo "Rerunning latex...." ;\
-	  pdflatex --shell-escape user.tex ;\
-	  latex_count=`expr $$latex_count - 1` ;\
-	done
-
+all : doxygen
 
 doxygen :
 	doxygen ops/Doxyfile
 	cd ops/latex; make refman.pdf
 	doxygen ops_translator/Doxyfile
 	cd ops_translator/latex; make refman.pdf
 
-clean : 
+clean :
 	-rm -f *.out *.aux *.blg *.pyg.* *.log *.backup *.toc *~ *.bbl
 	-rm -rf _minted-user
diff --git a/doc/README b/doc/README
index e941abeaa5..5dff700d96 100644
--- a/doc/README
+++ b/doc/README
@@ -1,2 +1,2 @@
 Latest documentation can be found on:
-https://op-dsl.github.io/
+https://ops-dsl.readthedocs.io/en/latest/
diff --git a/doc/apps.md b/doc/apps.md
new file mode 100644
index 0000000000..0b8fca0efb
--- /dev/null
+++ b/doc/apps.md
@@ -0,0 +1,11 @@
+# Examples
+
+See `OPS/apps/[c|fortran]/[application]/test.sh` for instructions on compiling and running the various parallel versions generated by OPS for each application.
+
+Further documentation under construction.
+
+
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000000..ac5f42dbae
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,63 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+import subprocess
+#subprocess.call('doxygen ops/Doxyfile', shell=True)
+#subprocess.call('cp ops/html/ . -r', shell=True)
+#html_extra_path = ['ops/html']
+
+# -- Project information -----------------------------------------------------
+
+project = 'Oxford Parallel library for Structured mesh solvers'
+copyright = '2013, Gihan Mudalige, Istvan Reguly, Mike Giles and others'
+author = "Mike Giles, Istvan Reguly, Gihan Mudalige"
+
+# The full version, including alpha/beta/rc tags
+release = 'latest'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.mathjax',
+    'sphinx.ext.ifconfig',
+    'myst_parser'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+
+source_suffix = ['.rst', '.md']
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme" #'alabaster'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
diff --git a/doc/devanapp.md b/doc/devanapp.md
new file mode 100644
index 0000000000..40607e76a4
--- /dev/null
+++ b/doc/devanapp.md
@@ -0,0 +1,280 @@
+# Developing an OPS Application
+This page provides a tutorial on the basics of using OPS for multi-block structured mesh application development. This is taken from a [presentation](https://op-dsl.github.io/docs/OPS/tutorial.pdf) given initially in April 2018 and subsequently updated for the latest release of OPS.
+
+## OPS Abstraction
+OPS is a Domain Specific Language embedded in C/C++ and Fortran, targeting the development of multi-block structured mesh computations. The abstraction has two distinct components: the definition of the mesh, and operations over the mesh.
+* Defining a number of 1-3D blocks, and on them a number of datasets, which have specific extents in the different dimensions.
+* Describing a parallel loop over a given block, with a given iteration range, executing a given "kernel function" at each mesh point, and describing what datasets are going to be accessed and how.
+* Additionally, one needs to declare stencils (access patterns) that will be used in parallel loops to access datasets, and any global constants (read-only global scope variables)
+
+Data and computations expressed this way can be automatically managed and parallelised by the OPS library.
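+To give a flavour of the API before the step-by-step walkthrough below, here is a minimal sketch (the names `block`, `d_u`, `range` and the kernel `set_zero` are illustrative; Steps 2 and 3 below define the real equivalents):
+```
+//a 2D block, one dataset on it, and a 1-point stencil
+ops_block block = ops_decl_block(2, "grid");
+ops_dat d_u = ops_decl_dat(block, 1, size, base, d_m, d_p, u, "double", "u");
+int pt[] = {0,0};
+ops_stencil S2D_00 = ops_decl_stencil(2, 1, pt, "0,0");
+
+//execute the elemental kernel set_zero at every point of the iteration range
+ops_par_loop(set_zero, "set_zero", block, 2, range,
+             ops_arg_dat(d_u, 1, S2D_00, "double", OPS_WRITE));
+```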
+Higher dimensions are supported in the backend, but not currently by the code generators.
+
+## Example Application
+In this tutorial we will use an example application, a simple 2D iterative Laplace equation solver.
+* Go to the `OPS/apps/c/laplace2dtutorial/original` directory
+* Open the `laplace2d.cpp` file
+* It uses an $imax$ x $jmax$ mesh, with an additional single layer of boundary cells on all sides
+* There are a number of loops that set the boundary conditions along the four edges
+* The bulk of the simulation is spent in a while loop, repeating a stencil kernel with a maximum reduction, and a copy kernel
+* Compile and run the code!
+
+## Original - Initialisation
+The original code begins with initializing the data arrays used in the calculation:
+```
+//Size along y
+int jmax = 4094;
+//Size along x
+int imax = 4094;
+//Maximum number of iterations
+int iter_max = 100;
+
+double pi = 2.0 * asin(1.0);
+const double tol = 1.0e-6;
+double error = 1.0;
+
+double *A;
+double *Anew;
+double *y0;
+
+A    = (double *)malloc((imax+2) * (jmax+2) * sizeof(double));
+Anew = (double *)malloc((imax+2) * (jmax+2) * sizeof(double));
+y0   = (double *)malloc((imax+2) * sizeof(double));
+
+memset(A, 0, (imax+2) * (jmax+2) * sizeof(double));
+```
+## Original - Boundary loops
+The application then sets the boundary conditions:
+```
+for (int i = 0; i < imax+2; i++)
+  A[(0)*(imax+2)+i] = 0.0;
+
+for (int i = 0; i < imax+2; i++)
+  A[(jmax+1)*(imax+2)+i] = 0.0;
+
+for (int j = 0; j < jmax+2; j++) {
+  A[(j)*(imax+2)+0] = sin(pi * j / (jmax+1));
+}
+
+for (int j = 0; j < imax+2; j++) {
+  A[(j)*(imax+2)+imax+1] = sin(pi * j / (jmax+1))*exp(-pi);
+}
+```
+Note how in the latter two loops the loop index is used.
+
+## Original - Main iteration
+The main iterative loop is a while loop that runs until the error drops below the set tolerance, or the number of iterations reaches the set maximum.
+```
+while ( error > tol && iter < iter_max ) {
+  error = 0.0;
+  for( int j = 1; j < jmax+1; j++ ) {
+    for( int i = 1; i < imax+1; i++) {
+      Anew[(j)*(imax+2)+i] = 0.25f *
+          ( A[(j)*(imax+2)+i+1] + A[(j)*(imax+2)+i-1]
+          + A[(j-1)*(imax+2)+i] + A[(j+1)*(imax+2)+i]);
+      error = fmax( error, fabs(Anew[(j)*(imax+2)+i]-A[(j)*(imax+2)+i]));
+    }
+  }
+  for( int j = 1; j < jmax+1; j++ ) {
+    for( int i = 1; i < imax+1; i++) {
+      A[(j)*(imax+2)+i] = Anew[(j)*(imax+2)+i];
+    }
+  }
+  if(iter % 10 == 0) printf("%5d, %0.6f\n", iter, error);
+  iter++;
+}
+```
+## Build OPS
+Build OPS using the instructions on the [Getting Started](https://ops-dsl.readthedocs.io/en/markdowndocdev/installation.html#getting-started) page.
+
+## Step 1 - Preparing to use OPS
+Firstly, include the appropriate header files, then initialise OPS, and at the end finalise it.
+* Define that this application is 2D, include the OPS header file, and create a header file where the outlined "elemental kernels" will live.
+```
+#define OPS_2D
+#include <ops_seq_v2.h>
+#include "laplace_kernels.h"
+```
+* Initialise and finalise OPS
+```
+int main(int argc, const char** argv) {
+  //Initialise the OPS library, passing runtime args, and setting diagnostics level to low (1)
+  ops_init(argc, argv, 1);
+  ...
+  ...
+  //Finalising the OPS library
+  ops_exit();
+}
+```
+By this point you need OPS set up - take a look at the Makefile in step1, and observe that the include and library paths are added, and we link against `ops_seq`.
+
+## Step 2 - OPS declarations
+Now declare a block and data on the block:
+```
+//The 2D block
+ops_block block = ops_decl_block(2, "my_grid");
+
+//The two datasets
+int size[] = {imax, jmax};
+int base[] = {0,0};
+int d_m[] = {-1,-1};
+int d_p[] = {1,1};
+ops_dat d_A    = ops_decl_dat(block, 1, size, base,
+                              d_m, d_p, A,    "double", "A");
+ops_dat d_Anew = ops_decl_dat(block, 1, size, base,
+                              d_m, d_p, Anew, "double", "Anew");
+```
+Datasets have a size (number of mesh points in each dimension). There is padding for halos or boundaries in the positive (`d_p`) and negative (`d_m`) directions. Here we use a 1-thick boundary layer. The base index can be defined, as it may be different from 0 (e.g. in Fortran). With a 0 base index and a 1-wide halo, these datasets can be indexed from −1 to size+1.
+
+OPS supports gradual conversion of applications to its API, but in this case the described data sizes will need to match: the allocated memory and its extents need to be correctly described to OPS. In this example we have two `(imax+2) * (jmax+2)` size arrays, and the total size in each dimension needs to match `size[i] + d_p[i] - d_m[i]`. This is only supported for the sequential and OpenMP backends. If a `NULL` pointer is passed, OPS will allocate the data internally.
+
+We also need to declare the stencils that will be used - in this example most loops use a simple 1-point stencil, and one uses a 5-point stencil:
+```
+//Two stencils, a 1-point, and a 5-point
+int s2d_00[] = {0,0};
+ops_stencil S2D_00 = ops_decl_stencil(2,1,s2d_00,"0,0");
+int s2d_5pt[] = {0,0, 1,0, -1,0, 0,1, 0,-1};
+ops_stencil S2D_5pt = ops_decl_stencil(2,5,s2d_5pt,"5pt");
+```
+Different names may be used for stencils in your code, but we suggest using some convention.
+
+## Step 3 - First parallel loop
+You can now convert the first loop to use OPS:
+```
+for (int i = 0; i < imax+2; i++)
+  A[(0)*(imax+2)+i] = 0.0;
+```
+This is a loop on the bottom boundary of the domain, which is at the −1 index for our dataset, therefore our iteration range will be over the entire domain, including halos in the X direction, and the bottom boundary in the Y direction. The iteration range is given as beginning (inclusive) and end (exclusive) indices in the x, y, etc. directions.
+```
+int bottom_range[] = {-1, imax+1, -1, 0};
+```
+Next, we need to outline the “elemental kernel” into `laplace_kernels.h`, and place the appropriate access objects - `ACC<double> &A` - in the kernel’s formal parameter list; `(i,j)` are the stencil offsets in the X and Y directions respectively:
+```
+void set_zero(ACC<double> &A) {
+  A(0,0) = 0.0;
+}
+```
+The OPS parallel loop can now be written as follows:
+```
+ops_par_loop(set_zero, "set_zero", block, 2, bottom_range,
+             ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE));
+```
+The loop will execute `set_zero` at each mesh point defined in the iteration range, and write the dataset `d_A` with the 1-point stencil. The `ops_par_loop` implies that the order in which mesh points will be executed will not affect the end result (within machine precision).
+
+There are three more loops which set values to zero; they can be trivially replaced with the code above, only altering the iteration range, as the sketch below shows.
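+For instance, the loop over the top boundary could be converted as follows (an illustrative sketch - the range follows the same pattern as `bottom_range`, with the top halo row at index `jmax` in the Y direction):
+```
+int top_range[] = {-1, imax+1, jmax, jmax+1};
+ops_par_loop(set_zero, "set_zero", block, 2, top_range,
+             ops_arg_dat(d_A, 1, S2D_00, "double", OPS_WRITE));
+```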
+In the main while loop, the second, simpler loop simply copies data from one array to another, this time on the interior of the domain:
+```
+int interior_range[] = {0,imax,0,jmax};
+ops_par_loop(copy, "copy", block, 2, interior_range,
+             ops_arg_dat(d_A,    1, S2D_00, "double", OPS_WRITE),
+             ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_READ));
+```
+And the corresponding outlined elemental kernel is as follows:
+```
+void copy(ACC<double> &A, const ACC<double> &Anew) {
+  A(0,0) = Anew(0,0);
+}
+```
+## Step 4 - Indexes and global constants
+There are two sets of boundary loops which use the loop variable j - this is a common technique to initialise data, such as coordinates (`x = i*dx`). OPS has a special argument `ops_arg_idx` which gives us a globally coherent (including over MPI) iteration index - between the bounds supplied in the iteration range.
+```
+ops_par_loop(left_bndcon, "left_bndcon", block, 2, left_range,
+             ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_WRITE),
+             ops_arg_idx());
+```
+And the corresponding outlined user kernel is as follows. Observe the `idx` argument and the +1 offset due to the difference in indexing:
+```
+void left_bndcon(ACC<double> &A, const int *idx) {
+  A(0,0) = sin(pi * (idx[1]+1) / (jmax+1));
+}
+```
+This kernel also uses two variables, `jmax` and `pi`, that do not depend on the iteration index - they are iteration space invariant. OPS has two ways of supporting this:
+
+1) Global scope constants, through `ops_decl_const`, as done in this example: we need to move the declaration of the `imax`, `jmax` and `pi` variables to global scope (outside of main), and call the OPS API:
+```
+//declare and define global constants
+ops_decl_const("imax",1,"int",&imax);
+ops_decl_const("jmax",1,"int",&jmax);
+ops_decl_const("pi",1,"double",&pi);
+```
+These variables do not need to be passed in to the elemental kernel; they are accessible in all elemental kernels.
+
+2) The other option is to explicitly pass it to the elemental kernel with `ops_arg_gbl`: this is for scalars and small arrays that should not be in global scope.
+
+
+## Step 5 - Complex stencils and reductions
+There is only one loop left, which uses a 5-point stencil and a reduction. It can be outlined as usual, and for the stencil we will use `S2D_5pt`.
+```
+ops_par_loop(apply_stencil, "apply_stencil", block, 2, interior_range,
+             ops_arg_dat(d_A,    1, S2D_5pt, "double", OPS_READ),
+             ops_arg_dat(d_Anew, 1, S2D_00,  "double", OPS_WRITE),
+             ops_arg_reduce(h_err, 1, "double", OPS_MAX));
+```
+And the corresponding outlined elemental kernel is as follows. Observe the stencil offsets used to access the adjacent 4 points:
+```
+void apply_stencil(const ACC<double> &A, ACC<double> &Anew, double *error) {
+  Anew(0,0) = 0.25f * ( A(1,0) + A(-1,0)
+                      + A(0,-1) + A(0,1));
+  *error = fmax( *error, fabs(Anew(0,0)-A(0,0)));
+}
+```
+The loop also has a special argument for the reduction, `ops_arg_reduce`. As the first argument, it takes a reduction handle, which has to be defined separately:
+```
+ops_reduction h_err = ops_decl_reduction_handle(sizeof(double), "double", "error");
+```
+Reductions may be increment (`OPS_INC`), min (`OPS_MIN`) or max (`OPS_MAX`). The user kernel will have to perform the reduction operation, reducing the passed in value as well as the computed value.
+
+The result of the reduction can be queried from the handle as follows:
+```
+ops_reduction_result(h_err, &error);
+```
+
+Multiple parallel loops may use the same handle, and their results will be combined, until the result is queried by the user.
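+Putting the reduction pieces together, the overall pattern is as follows (a condensed sketch built from the calls shown above):
+```
+ops_reduction h_err = ops_decl_reduction_handle(sizeof(double), "double", "error");
+while (error > tol && iter < iter_max) {
+  ops_par_loop(apply_stencil, "apply_stencil", block, 2, interior_range,
+               ops_arg_dat(d_A,    1, S2D_5pt, "double", OPS_READ),
+               ops_arg_dat(d_Anew, 1, S2D_00,  "double", OPS_WRITE),
+               ops_arg_reduce(h_err, 1, "double", OPS_MAX));
+  ops_par_loop(copy, "copy", block, 2, interior_range,
+               ops_arg_dat(d_A,    1, S2D_00, "double", OPS_WRITE),
+               ops_arg_dat(d_Anew, 1, S2D_00, "double", OPS_READ));
+  //any queued loops execute, at the latest, here, when the result is needed
+  ops_reduction_result(h_err, &error);
+  iter++;
+}
+```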
+Parallel loops that only have the reduction handle in common are semantically independent.
+
+## Step 6 - Handing it all to OPS
+
+We have now successfully converted all computations on the mesh to OPS parallel loops. In order for OPS to manage data and parallelisations better, we should let OPS allocate the datasets - instead of passing in the pointers to memory allocated by us, we just pass in NULL (`A` and `Anew`). Parallel I/O can be done using HDF5 - see the ops_hdf5.h header.
+
+All data and parallelisation is now handed to OPS. We can now also compile the developer MPI version of the code - see the Makefile, and try building `laplace2d_mpi`.
+
+## Step 7 - Code generation
+
+Now that the developer versions of our code work, it’s time to generate code. On the console, type:
+```
+$OPS_INSTALL_PATH/../ops_translator/c/ops.py laplace2d.cpp
+```
+We have provided a Makefile which can use several different compilers (intel, cray, pgi, clang); we suggest modifying it for your own applications. Try building CUDA, OpenMP, MPI+CUDA, MPI+OpenMP, and other versions of the code. You can take a look at the generated kernels for different parallelisations under the appropriate subfolders.
+
+If you add the `-OPS_DIAGS=2` runtime flag, at the end of execution OPS will report timings and achieved bandwidth for each of your kernels. For more options, see [Runtime Flags and Options](https://ops-dsl.readthedocs.io/en/markdowndocdev/devanapp.html#runtime-flags-and-options).
+
+
+## Code generated versions
+OPS will generate and compile a large number of different versions.
+* `laplace2d_dev_seq` and `laplace2d_dev_mpi`: these do not use code generation; they are intended for development only
+* `laplace2d_seq` and `laplace2d_mpi`: baseline sequential and MPI implementations
+* `laplace2d_openmp`: baseline OpenMP implementation
+* `laplace2d_cuda`, `laplace2d_opencl`, `laplace2d_openacc`: implementations targeting GPUs
+* `laplace2d_mpiinline`: optimised implementation with MPI+OpenMP
+* `laplace2d_tiled`: optimised implementation with OpenMP that improves spatial and temporal locality
+
+## Optimizations - general
+Try the following performance tuning options:
+* `laplace2d_cuda`, `laplace2d_opencl`: you can set the `OPS_BLOCK_SIZE_X` and `OPS_BLOCK_SIZE_Y` runtime arguments to control thread block or work group sizes
+* `laplace2d_mpi_cuda`, `laplace2d_mpi_openacc`: add the `-gpudirect` runtime flag to enable GPU Direct communications
+
+
+## Optimizations - tiling
+
+Tiling uses lazy execution: as parallel loops follow one another, they are not executed, but put in a queue, and only once some data needs to be returned to the user (e.g. the result of a reduction) do these loops have to be executed.
+
+With a chain of loops queued, OPS can analyse them together and come up with a tiled execution schedule.
+
+This works over MPI as well: OPS extends the halo regions, and does one big halo exchange instead of several smaller ones. In the current `laplace2d` code, every stencil application loop is also doing a reduction, therefore only two loops are queued. Try modifying the code so the reduction only happens every 10 iterations! On a Xeon E5-2650, one can get a 2.5x speedup.
+
+The following versions can be executed with the tiling optimizations.
+
+* `laplace2d_tiled`, `laplace2d_mpi_tiled`: add the `OPS_TILING` runtime flag, and set `-OPS_DIAGS=3` to see the cache blocking tiling at work.
+For some applications, such as this one, the initial guess gives too large tiles; try setting `OPS_CACHE_SIZE` to a lower value (in MB, for the L3 size). Thread affinity control and using 1 process per socket is strongly recommended. E.g. `OMP_NUM_THREADS=20 numactl --cpunodebind=0 ./laplace2d_tiled -OPS_DIAGS=3 OPS_TILING OPS_CACHE_SIZE=5`. Over MPI, you will have to set `OPS_TILING_MAX_DEPTH` to extend halo regions.
+
+
+
+
+
+
diff --git a/doc/devdoc.md b/doc/devdoc.md
new file mode 100644
index 0000000000..9f8b4f6246
--- /dev/null
+++ b/doc/devdoc.md
@@ -0,0 +1,59 @@
+# Developer Guide
+Under construction.
+
+## Contributing
+
+To contribute to OPS please use the following steps:
+1. Clone the [OPS](https://github.com/OP-DSL/OPS) repository (on your local system).
+2. Create a new branch in your cloned repository
+3. Make changes / contributions in your new branch
+4. Submit your changes by creating a Pull Request to the `develop` branch of the OPS repository
+
+The contributions in the `develop` branch will be merged into the `master` branch as we create a new release.
+
+
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000000..39efe81875
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,29 @@
+.. Test documentation master file, created by
+   sphinx-quickstart on Thu Sep 23 09:45:16 2021.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to OPS documentation!
+================================
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   introduction.md
+   installation.md
+   devanapp.md
+   opsapi.md
+   apps.md
+   perf.md
+   devdoc.md
+   pubs.md
+
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/doc/installation.md b/doc/installation.md
new file mode 100644
index 0000000000..cbc04f505a
--- /dev/null
+++ b/doc/installation.md
@@ -0,0 +1,151 @@
+# Getting Started
+
+**Note: The current CMakefile and relevant instructions are mainly tested on linux-based systems including Windows Subsystem for Linux**
+
+## Dependencies
+
+The following prerequisites and dependencies are required for building OPS. Building each of the **backends** is optional and depends on the hardware and/or capabilities you will be targeting.
+
+**CMake**
+
+CMake 3.18 or newer is required for using the CMake building system. If the latest version is not installed/shipped by default, it can be downloaded from https://cmake.org/download/, e.g., using the following script.
+```bash
+version=3.19.0
+wget https://github.com/Kitware/CMake/releases/download/v$version/cmake-$version-Linux-x86_64.sh
+# Assume that CMake is going to be installed at /usr/local/cmake
+cmake_dir=/usr/local/cmake
+# sudo is not necessary for directories in user space.
+sudo mkdir $cmake_dir
+sudo sh ./cmake-$version-Linux-x86_64.sh --prefix=$cmake_dir --skip-license
+sudo ln -s $cmake_dir/bin/cmake /usr/local/bin/cmake
+```
+
+**Python2**
+
+Python2 is required by the OPS Python translator. The CMake build system will try to identify it automatically. However, the process can sometimes fail (e.g., if there are both Python2 and Python3 installed). If this happens, the path to Python2 can be specified manually by using `-DPython2_EXECUTABLE` when invoking CMake.
+
+**HDF5**
+
+[HDF5](https://www.hdfgroup.org/solutions/hdf5) is required for parts of the IO functionality.
+The CMake build system **uses the parallel version by default** even for sequential codes, and automatically identifies the library. If the automatic process fails, the path to the parallel HDF5 library can be specified by using `-DHDF5_ROOT`.
+
+**CUDA Backend**
+
+The [CUDA](https://developer.nvidia.com/cuda-downloads) backend targets NVIDIA GPUs with a compute capability of 3.0 or greater. The CMake build system will detect the toolkit automatically. If the automatic process fails, the build system will compile the library without CUDA support. Please use `-DCUDA_TOOLKIT_ROOT_DIR` to manually specify the path.
+
+**HIP Backend**
+
+The HIP backend targets AMD GPUs and NVIDIA GPUs which are supported by HIP - either through its CUDA support or the [ROCm](https://rocmdocs.amd.com/en/latest/) stack (tested with >=3.9).
+
+**SYCL Backend**
+
+The [SYCL](https://www.khronos.org/sycl/) backend is currently in development and only working without MPI. It has been tested with Intel OneAPI (>=2021.1), Intel's public LLVM version, and hipSYCL (>=0.9.1), and runs on Intel CPUs and GPUs through Intel's OpenCL and Level Zero, and on NVIDIA and AMD GPUs both with the LLVM fork as well as hipSYCL. hipSYCL's OpenMP support covers most CPU architectures too.
+
+**Tridiagonal Solver Backend**
+
+To use the tridiagonal solver OPS API in applications and build example applications such as `adi`, `adi_burger` and `adi_burger_3D`, the open source tridiagonal solver (scalar) library needs to be cloned and built from the [Tridsolver repository](https://github.com/OP-DSL/tridsolver).
+```bash
+git clone https://github.com/OP-DSL/tridsolver.git
+```
+Details on building the scalar tridiagonal solver library can be found in the [README](https://github.com/OP-DSL/tridsolver/blob/master/scalar/README) file located in the appropriate subdirectory.
+
+## Obtaining OPS
+The latest OPS source code can be obtained by cloning the [OPS repository](https://github.com/OP-DSL/OPS) using
+```bash
+git clone https://github.com/OP-DSL/OPS.git
+```
+
+## Build OPS
+### Using cmake
+#### Build library and example applications together
+
+Create a build directory, and run CMake (version 3.18 or newer)
+```bash
+mkdir build
+cd build
+# Please see below for CMake options
+cmake ${PATH_TO_OPS} -DBUILD_OPS_APPS=ON -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL -DGPU_NUMBER=1
+make # IEEE=1 enable IEEE flags in compiler
+make install # sudo is needed if a directory like /usr/local/ is chosen.
+```
+After installation, the library and the python translator can be found at the directory specified by `CMAKE_INSTALL_PREFIX`, together with the executable files for applications at `APP_INSTALL_DIR`.
+
+#### Build library and example applications separately
+
+In this mode, the library can be built and installed first as
+
+```bash
+mkdir build
+cd build
+# Please see below for CMake options
+cmake ${PATH_TO_OPS} -DCMAKE_INSTALL_PREFIX=$HOME/OPS-INSTALL
+make # IEEE=1 enable IEEE flags in compiler
+make install # sudo is needed if a system directory is chosen.
+```
+then the applications can be built as
+
+```bash
+mkdir appbuild
+cd appbuild
+# Please see below for CMake options
+cmake ${PATH_TO_APPS} -DOPS_INSTALL_DIR=$HOME/OPS-INSTALL -DOPS_TEST=ON -DAPP_INSTALL_DIR=$HOME/OPS-APP -DGPU_NUMBER=1
+make # IEEE=1 this option is important for applications to get accurate results
+```
+
+
+#### cmake options
+
+* `-DCMAKE_BUILD_TYPE=Release` - enable optimizations
+* `-DBUILD_OPS_APPS=ON` - build example applications (Library CMake only)
+* `-DOPS_TEST=ON` - enable the tests
+* `-DCMAKE_INSTALL_PREFIX=` - specify the installation directory for the library (`/usr/local` by default, Library CMake only)
+* `-DAPP_INSTALL_DIR=` - specify the installation directory for the applications (`$HOME/OPS-APPS` by default)
+* `-DGPU_NUMBER=` - specify the number of GPUs used in the tests
+* `-DOPS_INSTALL_DIR=` - specify where the OPS library is installed (Application CMake only, see [here](#build-the-library-and-example-applications-separately))
+* `-DOPS_VERBOSE_WARNING=ON` - show verbose output during the building process
+
+
+
+
+
+### Using Makefiles
+#### Set up environmental variables:
+
+* `OPS_COMPILER` - compiler to be used (currently supports Intel, PGI and Cray compilers, but others can be easily incorporated by extending the Makefiles used in steps 2 and 3)
+* `OPS_INSTALL_PATH` - Installation directory of OPS/ops
+* `CUDA_INSTALL_PATH` - Installation directory of CUDA, usually `/usr/local/cuda` (to build CUDA libs and applications)
+* `OPENCL_INSTALL_PATH` - Installation directory of OpenCL, usually `/usr/local/cuda` for the NVIDIA OpenCL implementation (to build OpenCL libs and applications)
+* `MPI_INSTALL_PATH` - Installation directory of MPI (to build MPI based distributed memory libs and applications)
+* `HDF5_INSTALL_PATH` - Installation directory of HDF5 (to support HDF5 based File I/O)
+
+See the example scripts (e.g. `source_intel`, `source_pgi_15.10`, `source_cray`) under `OPS/ops/scripts` that set up the environment for building with various compilers (Intel, PGI, Cray).
+
+#### Build back-end library
+For the C/C++ back-end use the Makefile under `OPS/ops/c` (modify Makefile if required). The libraries will be built in `OPS/ops/c/lib`
+```bash
+cd $OPS_INSTALL_PATH/c
+make
+```
+For the Fortran back-end use the Makefile under `OPS/ops/fortran` (modify Makefile if required). The libraries will be built in `OPS/ops/fortran/lib`
+```bash
+cd $OPS_INSTALL_PATH/fortran
+make
+```
+#### Build example applications
+For example to build CloverLeaf_3D under `OPS/apps/c/CloverLeaf_3D`
+```bash
+cd ../apps/c/Cloverleaf_3D/
+make
+```
+
+
diff --git a/doc/introduction.md b/doc/introduction.md
new file mode 100644
index 0000000000..b458afae3d
--- /dev/null
+++ b/doc/introduction.md
@@ -0,0 +1,39 @@
+# Introduction
+
+## Overview
+
+[OPS](https://github.com/OP-DSL/OPS) (Oxford Parallel library for Structured mesh solvers) is a high-level embedded domain specific language (eDSL) for writing **multi-block structured mesh** algorithms, and the corresponding software library and code translation tools to enable automatic parallelisation on multi-core and many-core architectures.
+Multi-block structured meshes consist of an unstructured collection of structured meshes. The OPS API is embedded in C/C++ and Fortran.
+
+The current OPS eDSL supports generating code targeting multi-core/multi-threaded CPUs, many-core GPUs, and clusters of CPUs and GPUs, using a range of parallelisation models including SIMD vectorization, OpenMP, CUDA, OpenCL, OpenACC and their combinations with MPI. There is also experimental support for parallelisation using SYCL and AMD HIP. Various optimizations for each parallelisation can be generated automatically, including cache blocking tiling to improve locality. The OPS API and library can also be used to solve multi-dimensional tridiagonal systems using the [tridsolver](https://github.com/OP-DSL/tridsolver) library.
+
+These pages provide detailed documentation on using OPS, including an installation guide, developing and running OPS applications, the OPS API, developer documentation and performance tuning.
+
+
+## Licensing
+OPS is released as an open-source project under the BSD 3-Clause License. See the [LICENSE](https://github.com/OP-DSL/OPS/blob/master/LICENSE) file for more information.
+
+## Citing
+To cite OPS, please reference the following paper:
+
+[I. Z. Reguly, G. R. Mudalige and M. B. Giles, Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS, in IEEE Transactions on Parallel and Distributed Systems, vol. 29, no. 4, pp. 873-886, 1 April 2018, doi: 10.1109/TPDS.2017.2778161.](https://ieeexplore.ieee.org/abstract/document/8121995)
+
+```
+@ARTICLE{Reguly_et_al_2018,
+  author={Reguly, István Z. and Mudalige, Gihan R. and Giles, Michael B.},
+  journal={IEEE Transactions on Parallel and Distributed Systems},
+  title={Loop Tiling in Large-Scale Stencil Codes at Run-Time with OPS},
+  year={2018},
+  volume={29},
+  number={4},
+  pages={873-886},
+  doi={10.1109/TPDS.2017.2778161}}
+```
+The full list of publications from the OPS project can be found in the [Publications](https://ops-dsl.readthedocs.io/en/markdowndocdev/pubs.html) section.
+
+## Support
+The preferred method of reporting bugs and issues with OPS is to submit an issue via the repository’s issue tracker. Users can also email the authors directly by contacting the [OP-DSL team](https://op-dsl.github.io/about.html).
+
+## Funding
+The development of OPS was in part supported by the UK Engineering and Physical Sciences Research Council (EPSRC) grants [EP/K038494/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/K038494/1) (“Future-proof massively-parallel execution of multi-block applications”), [EP/J010553/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/J010553/1) (“Software for Emerging Architectures - ASEArch"), the UK Turbulence Consortium grant [EP/T026170/1](https://gow.epsrc.ukri.org/NGBOViewGrant.aspx?GrantRef=EP/T026170/1), the Janos Bolyai Research Scholarship of the Hungarian Academy of Sciences, the Royal Society through their Industry Fellowship Scheme (INF/R1/180012), and the Thematic Research Cooperation Establishing Innovative Informatic and Info-communication Solutions Project, which has been supported by the European Union and co-financed by the European Social Fund under grant number EFOP-3.6.2-16-2017-00013.
+Research funding support was also provided by the UK AWE under grants CDK0660 ("The Production of Predictive Models for Future Computing Requirements"), CDK0724 ("AWE Technical Outreach Programme"), the AWE grant for "High-level Abstractions for Performance, Portability and Continuity of Scientific Software on Future Computing Systems", and the Numerical Algorithms Group [NAG](https://www.nag.com/).
+
+Hardware resources for development and testing were provided by the Oak Ridge Leadership Computing Facility at the Oak Ridge National Laboratory, which is supported by the Office of Science of the U.S. Department of Energy under Contract No. DE-AC05-00OR22725, the [ARCHER](http://www.archer.ac.uk) and [ARCHER2](https://www.archer2.ac.uk/) UK National Supercomputing Service, the [University of Oxford Advanced Research Computing (ARC) facility](http://dx.doi.org/10.5281/zenodo.22558), and through hardware donations and access provided by NVIDIA and Intel.
diff --git a/doc/keyconcept.md b/doc/keyconcept.md
new file mode 100644
index 0000000000..bd26fc25b5
--- /dev/null
+++ b/doc/keyconcept.md
@@ -0,0 +1,102 @@
+# Key concepts and structure
+
+An OPS application can generally be divided into two key parts:
+initialisation and parallel execution. During the initialisation phase,
+one or more blocks (ops_block) are defined: these only have a
+dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets
+together. Datasets are defined on a block, and have a specific size (in
+each dimension of the block), which may be slightly different across
+different datasets (e.g. staggered grids), in some directions they may
+be degenerate (a size of 1), or they can represent data associated with
+different multigrid levels (where their size is a multiple or a fraction
+of other datasets). Datasets can be declared with empty (NULL) pointers,
+then OPS will allocate the appropriate amount of memory, may be passed
+non-NULL pointers (currently only supported in non-MPI environments), in
+which case OPS will assume the memory is large enough for the data and
+the block halo, and there are HDF5 dataset declaration routines which
+allow the distributed reading of datasets from HDF5 files. The concept
+of blocks is necessary to group datasets together, as in a multi-block
+problem, in a distributed memory environment, OPS needs to be able to
+determine how to decompose the problem.
+
+The initialisation phase usually also consists of defining the stencils
+to be used later on (though they can be defined later as well), which
+describe the data access patterns used in parallel loops. Stencils are
+always relative to the "current" point; e.g. if at iteration $(i,j)$, we
+wish to access $(i{-}1,j)$ and $(i,j)$, then the stencil will have two
+points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in
+one of the dimensions the dataset's size is 1), as well as for
+multigrid, there are special strided, restriction, and prolongation
+stencils: they differ from normal stencils in that as one steps through
+a grid in a parallel loop, the stepping is done with a non-unit stride
+for these datasets. For example, in a 2D problem, if we have a
+degenerate dataset called xcoords, size $(N,1)$, then we will need a
+stencil with stride $(1,0)$ to access it in a regular 2D loop.
+
+Finally, the initialisation phase may declare a number of global
+constants - these are variables in global scope that can be accessed
+from within user kernels, without having to pass them in explicitly.
+These may be scalars or small arrays, generally for values that do not
+change during execution, though they may be updated during execution
+with repeated calls to `ops_decl_const`, as the sketch below shows.
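+For example, a global constant might be declared as follows (an
+illustrative sketch following the C API used in the tutorial; `dt` is a
+hypothetical constant):
+```
+double dt = 0.001;
+ops_decl_const("dt", 1, "double", &dt);
+//if the value changes during execution, repeat the call with the same name
+dt = 0.0005;
+ops_decl_const("dt", 1, "double", &dt);
+```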
+The initialisation phase is terminated by a call to `ops_partition`.
+
+The bulk of the application consists of parallel loops, implemented
+using calls to `ops_par_loop`. These constructs work with datasets,
+passed through the opaque `ops_dat` handles declared during the
+initialisation phase. The iterations of parallel loops are semantically
+independent, and it is the responsibility of the user to enforce this:
+the order in which iterations are executed cannot affect the result
+(within the limits of floating point precision). Parallel loops are
+defined on a block, with a prescribed iteration range that is always
+defined from the perspective of the dataset written/modified (the sizes
+of datasets, particularly in multigrid situations, may be very
+different). Datasets are passed in using `ops_arg_dat`, and during
+execution, values at the current grid point will be passed to the user
+kernel. These values are passed wrapped in a templated `ACC<>` object
+(templated on the type of the data), whose parentheses operator is
+overloaded, which the user must use to specify the relative offset to
+access the grid point's neighbours (which accesses have to match the
+declared stencil). Datasets written may only be accessed with a
+one-point, zero-offset stencil (otherwise the parallel semantics may be
+violated).
+
+Other than datasets, one can pass in read-only scalars or small arrays
+that are iteration space invariant with `ops_arg_gbl` (typically
+weights, $\delta t$, etc. which may be different in different loops).
+The current iteration index can also be passed in with `ops_arg_idx`,
+which will pass a globally consistent index to the user kernel (i.e.
+also under MPI).
+
+Reductions in loops are done using the `ops_arg_reduce` argument, which
+takes a reduction handle as an argument. The result of the reduction can
+then be acquired using a separate call to `ops_reduction_result`. The
+semantics are the following: a reduction handle after it was declared is
+in an "uninitialised" state. The first time it is used as an argument to
+a loop, its type is determined (increment/min/max), and it is
+initialised appropriately $(0,\infty,-\infty)$, and subsequent uses of
+the handle in parallel loops are combined together, up until the point
+where the result is acquired using `ops_reduction_result`, which then
+sets it back to an uninitialised state. This also implies that different
+parallel loops, which all use the same reduction handle, but are
+otherwise independent, are independent and their partial reduction
+results can be combined together associatively and commutatively.
+
+OPS takes responsibility for all data, its movement and the execution of
+parallel loops. With different execution hardware and optimisations,
+this means OPS will re-organise data as well as execution (potentially
+across different loops), and therefore any data accesses or manipulation
+may only be done through the OPS API.
+
+This restriction is exploited by a lazy execution mechanism in OPS. The
+idea is that OPS API calls that do not return a result need not be
+executed immediately, but can be queued, and once an API call requires
+returning some data, operations in the queue are executed, and the
+result is returned. This allows OPS to analyse and optimise operations
+in the queue together.
+This mechanism is fully automated by OPS, and is
+used with the various \_tiled executables. For more information on how
+to use this mechanism for improving CPU performance, see the
+[Tiling](#sec:tiling) section. Some API calls triggering the execution
+of queued operations include `ops_reduction_result`, and the functions
+in the data access API.
\ No newline at end of file
diff --git a/doc/ops/Doxyfile b/doc/ops/Doxyfile
index 32f7c733c8..a116d6466f 100644
--- a/doc/ops/Doxyfile
+++ b/doc/ops/Doxyfile
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.11
+# Doxyfile 1.8.16
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -17,11 +17,11 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
 # The default value is: UTF-8.
 
 DOXYFILE_ENCODING      = UTF-8
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING      = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME           = "OPS"
+PROJECT_NAME           = OPS
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version
@@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES    = NO
 
 OUTPUT_LANGUAGE        = English
 
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION  = None
+
 # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
 # descriptions after the members that are listed in the file and class
 # documentation (similar to Javadoc). Set to NO to disable this.
@@ -179,6 +187,16 @@ SHORT_NAMES            = NO
 
 JAVADOC_AUTOBRIEF      = YES
 
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
 # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
 # line (until the first dot) of a Qt-style comment as the brief description. If
 # set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@@ -226,7 +244,12 @@ TAB_SIZE               = 4
 # will allow you to put the command \sideeffect (or @sideeffect) in the
 # documentation, which will result in a user-defined paragraph with heading
 # "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
+# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) ALIASES = @@ -264,17 +287,26 @@ OPTIMIZE_FOR_FORTRAN = YES OPTIMIZE_OUTPUT_VHDL = NO +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is +# Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # @@ -285,7 +317,7 @@ EXTENSION_MAPPING = inc=Fortran # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. +# documentation. See https://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. @@ -293,6 +325,15 @@ EXTENSION_MAPPING = inc=Fortran MARKDOWN_SUPPORT = YES +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. 
Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -309,7 +350,7 @@ AUTOLINK_SUPPORT = YES # diagrams that involve STL classes more complete and accurate. # The default value is: NO. -BUILTIN_STL_SUPPORT = NO +BUILTIN_STL_SUPPORT = YES # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. @@ -318,7 +359,7 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. @@ -424,6 +465,12 @@ EXTRACT_ALL = YES EXTRACT_PRIVATE = NO +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. @@ -502,7 +549,7 @@ INTERNAL_DOCS = NO # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. +# (including Cygwin) ands Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = YES @@ -512,7 +559,7 @@ CASE_SENSE_NAMES = YES # scope will be hidden. # The default value is: NO. -HIDE_SCOPE_NAMES = NO +HIDE_SCOPE_NAMES = YES # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to @@ -689,7 +736,7 @@ LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. @@ -734,7 +781,8 @@ WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. # The default value is: NO. WARN_NO_PARAMDOC = NO @@ -771,12 +819,16 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. 
-INPUT = ../README.md ../ops/c/src ../ops/c/include ../ops/fortran/src ../ops/fortran/include +INPUT = ../README.md \ + ../ops/c/src \ + ../ops/c/include \ + ../ops/fortran/src \ + ../ops/fortran/include # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of # possible encodings. # The default value is: UTF-8. @@ -793,8 +845,8 @@ INPUT_ENCODING = UTF-8 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, -# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = @@ -949,7 +1001,7 @@ INLINE_SOURCES = NO STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. +# entity all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO @@ -981,12 +1033,12 @@ SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version +# (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # @@ -1014,7 +1066,7 @@ VERBATIM_HEADERS = YES # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was -# generated with the -Duse-libclang=ON option for CMake. +# generated with the -Duse_libclang=ON option for CMake. # The default value is: NO. CLANG_ASSISTED_PARSING = NO @@ -1027,6 +1079,16 @@ CLANG_ASSISTED_PARSING = NO CLANG_OPTIONS = +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. 
+ +CLANG_DATABASE_PATH = + #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- @@ -1145,7 +1207,7 @@ HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. @@ -1181,6 +1243,17 @@ HTML_COLORSTYLE_GAMMA = 80 HTML_TIMESTAMP = NO +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. @@ -1204,13 +1277,13 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# environment (see: https://developer.apple.com/xcode/), introduced with OSX +# 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1249,7 +1322,7 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output @@ -1325,7 +1398,7 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). 
+# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1333,7 +1406,7 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1342,7 +1415,7 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1350,7 +1423,7 @@ QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1358,7 +1431,7 @@ QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = @@ -1416,7 +1489,7 @@ DISABLE_INDEX = NO # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -GENERATE_TREEVIEW = NO +GENERATE_TREEVIEW = YES # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. @@ -1451,7 +1524,7 @@ EXT_LINKS_IN_WINDOW = NO FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # @@ -1463,7 +1536,7 @@ FORMULA_FONTSIZE = 10 FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering +# https://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want to formulas look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path @@ -1490,8 +1563,8 @@ MATHJAX_FORMAT = NativeMML # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. 
-# The default value is: http://cdn.mathjax.org/mathjax/latest. +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest @@ -1501,7 +1574,8 @@ MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +MATHJAX_EXTENSIONS = TeX/AMSmath \ + TeX/AMSsymbols # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site @@ -1552,7 +1626,7 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). +# Xapian (see: https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1565,7 +1639,7 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). See the section "External Indexing and +# Xapian (see: https://xapian.org/). See the section "External Indexing and # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. @@ -1617,21 +1691,35 @@ LATEX_OUTPUT = ops/latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. # -# Note that when enabling USE_PDFLATEX this option is only used for generating -# bitmaps for formulas in the HTML output, but not in the Makefile that is -# written to the output directory. -# The default file is: latex. +# Note that when not enabling USE_PDFLATEX the default is latex when enabling +# USE_PDFLATEX the default is pdflatex and when in the later case latex is +# chosen this is overwritten by pdflatex. For specific output languages the +# default can have been set differently, this depends on the implementation of +# the output language. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate # index for LaTeX. +# Note: This tag is used in the Makefile / make.bat. +# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file +# (.tex). # The default file is: makeindex. # This tag requires that the tag GENERATE_LATEX is set to YES. MAKEINDEX_CMD_NAME = makeindex +# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to +# generate index for LaTeX. In case there is no backslash (\) as first character +# it will be automatically added in the LaTeX code. +# Note: This tag is used in the generated output file (.tex). +# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat. +# The default value is: makeindex. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_MAKEINDEX_CMD = makeindex + # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX # documents. This may be useful for small projects and may help to save some # trees in general. @@ -1752,7 +1840,7 @@ LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. 
See -# http://en.wikipedia.org/wiki/BibTeX and \cite for more info. +# https://en.wikipedia.org/wiki/BibTeX and \cite for more info. # The default value is: plain. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1766,6 +1854,14 @@ LATEX_BIB_STYLE = plain LATEX_TIMESTAMP = NO +# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) +# path from which the emoji images will be read. If a relative path is entered, +# it will be relative to the LATEX_OUTPUT directory. If left blank the +# LATEX_OUTPUT directory will be used. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_EMOJI_DIRECTORY = + #--------------------------------------------------------------------------- # Configuration options related to the RTF output #--------------------------------------------------------------------------- @@ -1805,9 +1901,9 @@ COMPACT_RTF = NO RTF_HYPERLINKS = NO -# Load stylesheet definitions from file. Syntax is similar to doxygen's config -# file, i.e. a series of assignments. You only have to provide replacements, -# missing definitions are set to their default value. +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# configuration file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. # # See also section "Doxygen usage" for information on how to generate the # default style sheet that doxygen normally uses. @@ -1816,8 +1912,8 @@ RTF_HYPERLINKS = NO RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is -# similar to doxygen's config file. A template extensions file can be generated -# using doxygen -e rtf extensionFile. +# similar to doxygen's configuration file. A template extensions file can be +# generated using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. RTF_EXTENSIONS_FILE = @@ -1903,6 +1999,13 @@ XML_OUTPUT = xml XML_PROGRAMLISTING = YES +# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include +# namespace members in file scope as well, matching the HTML output. +# The default value is: NO. +# This tag requires that the tag GENERATE_XML is set to YES. + +XML_NS_MEMB_FILE_SCOPE = NO + #--------------------------------------------------------------------------- # Configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- @@ -1935,9 +2038,9 @@ DOCBOOK_PROGRAMLISTING = NO #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sf.net) file that captures the -# structure of the code including all documentation. Note that this feature is -# still experimental and incomplete at the moment. +# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# the structure of the code including all documentation. Note that this feature +# is still experimental and incomplete at the moment. # The default value is: NO. GENERATE_AUTOGEN_DEF = NO @@ -2037,7 +2140,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. 
-PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS +PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2104,12 +2207,6 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of 'which perl'). -# The default file (with absolute path) is: /usr/bin/perl. - -PERL_PATH = /usr/bin/perl - #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- @@ -2123,15 +2220,6 @@ PERL_PATH = /usr/bin/perl CLASS_DIAGRAMS = YES -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see: -# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. @@ -2150,7 +2238,7 @@ HIDE_UNDOC_RELATIONS = YES # http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO -# The default value is: YES. +# The default value is: NO. HAVE_DOT = YES @@ -2306,9 +2394,7 @@ DIRECTORY_GRAPH = YES # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). -# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd, -# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo, -# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo, +# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo, # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and # png:gdiplus:gdiplus. # The default value is: png. @@ -2361,6 +2447,11 @@ DIAFILE_DIRS = PLANTUML_JAR_PATH = +# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a +# configuration file for plantuml. + +PLANTUML_CFG_FILE = + # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. diff --git a/doc/opsapi.md b/doc/opsapi.md new file mode 100644 index 0000000000..1562380e12 --- /dev/null +++ b/doc/opsapi.md @@ -0,0 +1,851 @@
+# OPS API
+
+## Overview
+
+The key characteristic of structured mesh applications is the implicit connectivity between neighboring mesh elements (such as vertices, cells). The main idea is that operations involve looping over a "rectangular" multi-dimensional set of mesh points using one or more "stencils" to access data. In multi-block meshes, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. an $i,j$ face of one block may correspond to the $j,k$ face of another block. This is awkward and hard to handle simply.
+
+## Key Concepts and Structure
+
+The OPS API allows the user to declare a computation over such multi-block structured meshes. An OPS application can generally be declared in two key parts: (1) initialisation and (2) iteration over the mesh (carried out as a parallel loop). During the initialisation phase, one or more blocks (we call these `ops_block`s) are defined: these only have a dimensionality (i.e. 1D, 2D, etc.), and serve to group datasets together. Datasets are defined on a block and have a specific size (in each dimension of the block), which may be slightly different across different datasets (e.g. staggered grids); in some directions they may be degenerate (a size of 1), or they can represent data associated with different multigrid levels (where their size is a multiple or a fraction of other datasets' sizes). Datasets can be declared with empty (NULL) pointers, in which case OPS will allocate the appropriate amount of memory; they may be passed non-NULL pointers (currently only supported in non-MPI environments), in which case OPS will assume the memory is large enough for the data and the block halo; and there are HDF5 dataset declaration routines which allow the distributed reading of datasets from HDF5 files. The concept of blocks is necessary to group datasets together: in a multi-block problem, in a distributed memory environment, OPS needs to be able to determine how to
+decompose the problem.
+
+The initialisation phase usually also consists of defining the stencils to be used later on (though they can also be defined later), which describe the data access patterns used in parallel loops. Stencils are always relative to the "current" point; e.g. if at iteration $(i,j)$ we wish to access $(i-1,j)$ and $(i,j)$, then the stencil will have two points: $\{(-1, 0), (0, 0)\}$. To support degenerate datasets (where in one of the dimensions the dataset's size is 1), as well as for multigrid, there are special strided, restriction, and prolongation stencils: they differ from normal stencils in that, as one steps through a grid in a parallel loop, the stepping is done with a non-unit stride
+for these datasets. For example, in a 2D problem, if we have a degenerate dataset called xcoords, of size $(N,1)$, then we will need a stencil with stride $(1,0)$ to access it in a regular 2D loop.
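+
+As an illustration, the two-point stencil $\{(-1, 0), (0, 0)\}$ above could be declared with `ops_decl_stencil` (documented later in this chapter); this is only a minimal sketch, and the stencil name is chosen arbitrarily:
+
+```c++
+// Flattened list of stencil points: (-1,0) and (0,0)
+int s2d_2pt[] = {-1,0, 0,0};
+ops_stencil S2D_2PT = ops_decl_stencil(2, 2, s2d_2pt, "S2D_2PT");
+```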
+
+Finally, the initialisation phase may declare a number of global constants - these are variables in global scope that can be accessed from within elemental kernels, without having to pass them in explicitly. These may be scalars or small arrays, generally for values that do not change during execution, though they may be updated during execution with repeated calls to `ops_decl_const`.
+
+The initialisation phase is terminated by a call to `ops_partition`.
+
+The bulk of the application consists of parallel loops, implemented using calls to `ops_par_loop`. These constructs work with datasets, passed through the opaque `ops_dat` handles declared during the initialisation phase. The iterations of parallel loops are semantically independent, and it is the responsibility of the user to enforce this: the order in which iterations are executed cannot affect the result (within the limits of floating point precision). Parallel loops are defined on a block, with a prescribed iteration range that is always defined from the perspective of the dataset written/modified (the sizes of datasets, particularly in multigrid situations, may be very different). Datasets are passed in using `ops_arg_dat`, and during execution, values at the current grid point will be passed to the user kernel. These values are passed wrapped in a templated `ACC<>` object (templated on the type of the data), whose parentheses operator is overloaded and must be used to specify the relative offset when accessing the grid point's neighbours (these accesses have to match the declared stencil). Datasets written may only be accessed with a one-point, zero-offset stencil (otherwise the parallel semantics may be violated).
+
+Other than datasets, one can pass in read-only scalars or small arrays that are iteration space invariant with `ops_arg_gbl` (typically weights, $\delta t$, etc., which may be different in different loops). The current iteration index can also be passed in with `ops_arg_idx`, which will pass a globally consistent index to the user kernel (i.e. also under MPI).
+
+Reductions in loops are done using the `ops_arg_reduce` argument, which takes a reduction handle as an argument. The result of the reduction can then be acquired using a separate call to `ops_reduction_result`. The semantics are the following: after it is declared, a reduction handle is in an "uninitialised" state. The first time it is used as an argument to a loop, its type is determined (increment/min/max) and it is initialised appropriately $(0,\infty,-\infty)$; subsequent uses of the handle in parallel loops are combined together, up until the point where the result is acquired using `ops_reduction_result`, which then sets it back to an uninitialised state. This also implies that different parallel loops which all use the same reduction handle, but are otherwise independent, remain independent, and their partial reduction results can be combined together associatively and commutatively.
+
+OPS takes responsibility for all data, its movement, and the execution of parallel loops. With different execution hardware and optimisations, this means OPS will **re-organise** data as well as execution (potentially across different loops), and therefore **data accesses or manipulation should only be done through the OPS API**. There is an external data access API that allows access to the data stored by OPS, which in turn allows interfacing with external libraries.
+
+This restriction is exploited by a lazy execution mechanism in OPS. The idea is that OPS API calls that do not return a result need not be executed immediately, but rather queued; once an API call requires returning some data, operations in the queue are executed, and the result is returned. This allows OPS to analyse and optimise operations in the queue together. This mechanism is fully automated by OPS, and is used with the various `_tiled` executables. For more information on how to use this mechanism for improving CPU performance, see the Section on Tiling. Some API calls triggering the execution of queued operations include `ops_reduction_result`, and the functions in the data access API.
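+
+To make the above concrete, the following sketch shows the general shape of a user kernel and the corresponding `ops_par_loop` call, using the argument routines documented later in this chapter. The dataset, stencil, and kernel names are invented for illustration only:
+
+```c++
+// Elemental kernel: reads a neighbour through the two-point stencil,
+// writes through a one-point, zero-offset stencil.
+void smooth_kernel(ACC<double> &out, const ACC<double> &in) {
+  out(0,0) = 0.5 * (in(0,0) + in(-1,0));
+}
+
+int range[4] = {1, nx, 0, ny};  // iteration range on the block
+ops_par_loop(smooth_kernel, "smooth_kernel", grid2D, 2, range,
+             ops_arg_dat(dat_out, S2D_1PT, "double", OPS_WRITE),
+             ops_arg_dat(dat_in,  S2D_2PT, "double", OPS_READ));
+```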
+
+To further clarify some of the important issues encountered when designing the OPS API, we note here some needs connected with a 3D application:
+
+* When looping over the interior with loop indices $i,j,k$, often there are 1D arrays which are referenced using just one of the indices.
+* To implement boundary conditions, we often loop over a 2D face, accessing both the 3D dataset and data from a 2D dataset.
+* To implement periodic boundary conditions using dummy "halo" points, we sometimes have to copy one plane of boundary data to another, e.g. if the first dimension has size $I$ then we might copy the plane $i=I-2$ to plane $i=0$, and plane $i=1$ to plane $i=I-1$.
+* In multigrid, we are working with two grids, one having twice as many points as the other in each direction. To handle this we require a stencil with a non-unit stride.
+* In multi-block grids, we have several structured blocks. The connectivity between the faces of different blocks can be quite complex, and in particular they may not be oriented in the same way, i.e. an $i,j$ face of one block may correspond to the $j,k$ face of another block.
+
+OPS handles all of these different requirements through stencil definitions.
+
+## OPS C and C++ API
+
+Both C and C++ style APIs are provided for using the capabilities of the OPS library. They are essentially the same, although there are minor differences in syntax. The C++ API is mainly designed for data abstraction, and therefore provides better data encapsulation as well as support for multiple instances and threading (currently OpenMP). In the following, both the C style routines and the C++ classes and methods are introduced according to their functionality, marked with (C) or (C++). If there is no such notice, the routine either applies to both or might not be provided by the C++ API.
+
+To enable the C++ API, the compiler directive ``OPS_CPP_API`` is required.
+
+### Initialisation and termination routines
+#### C Style
+##### ops_init
+
+__void ops_init(int argc, char** argv, int diags_level)__
+
+This routine must be called before all other OPS routines.
+
+| Arguments | Description |
+| ----------- | ----------- |
+| argc, argv | the usual command line arguments |
+| diags_level | an integer which defines the level of debugging diagnostics and reporting to be performed |
+
+Currently, higher `diags_level` values enable the following checks:
+
+`diags_level` $=$ 1 : no diagnostics, default to achieve best runtime
+performance.
+
+`diags_level` $>$ 1 : print block decomposition and `ops_par_loop`
+timing breakdown.
+
+`diags_level` $>$ 4 : print intra-block halo buffer allocation feedback
+(for OPS internal development only)
+
+`diags_level` $>$ 5 : check if intra-block halo MPI send depths match
+MPI receive depths (for OPS internal development only)
+
+##### ops_exit
+
+__void ops_exit()__
+
+This routine must be called last to cleanly terminate the OPS computation.
+#### C++ style
+
+With the C++ style API, all data structures (blocks, datasets, stencils, etc.) are encapsulated in the class ``OPS_instance``. Thus, we can allocate multiple instances of ``OPS_instance`` by using the class constructor, for example,
+
+```c++
+// Allocate an instance
+OPS_instance *instance = new OPS_instance(argc,argv,1,ss);
+```
+
+where the meaning of the arguments is the same as in the C API, while the extra argument (i.e., ss) accepts the output messages.
+
+An explicit termination is not needed for the C++ API, although we need to "delete" the instance if it is allocated through a pointer, i.e.,
+```C++
+delete instance;
+```
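+
+Putting the C style routines together, a minimal program skeleton looks like the following (a sketch only; `ops_seq.h` is the header conventionally included by OPS applications):
+
+```c++
+#include "ops_seq.h"
+
+int main(int argc, char **argv) {
+  ops_init(argc, argv, 1);   // diags_level 1: best runtime performance
+  // ... block, dataset, stencil and constant declarations ...
+  // ... ops_partition(...), then ops_par_loop(...) calls ...
+  ops_exit();                // clean termination
+  return 0;
+}
+```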
+
+### Declaration routines
+
+#### Block
+##### ops_decl_block (C)
+
+__ops_block ops_decl_block(int dims, char *name)__
+
+This routine defines a structured grid block.
+
+| Arguments | Description |
+| ----------- | ----------- |
+| dims | dimension of the block |
+| name | a name used for output diagnostics |
+
+##### OPS_instance::decl_block (C++)
+
+A method of the OPS_instance class for declaring a block, which accepts the same arguments as the C style function. An OPS_instance object should be constructed before this. The method returns a pointer to an ops_block type variable, where ops_block is an alias for a pointer to ops_block_core. An example is
+
+```C++
+ops_block grid2D = instance->decl_block(2, "grid2D");
+```
+
+##### ops_decl_block_hdf5 (C)
+
+__ops_block ops_decl_block_hdf5(int dims, char *name, char *file)__
+
+This routine reads the details of a structured grid block from a named HDF5 file.
+
+| Arguments | Description |
+| ----------- | ----------- |
+| dims | dimension of the block |
+| name | a name used for output diagnostics |
+| file |hdf5 file to read and obtain the block information from|
+
+Although this routine does not read in any extra information about the
+block from the named HDF5 file than what is already specified in the
+arguments, it is included here for error checking (e.g. to check whether blocks
+defined in an HDF5 file match the declared arguments in an
+application) and for completeness.
+
+#### Dat (ops_dat_core)
+##### ops_decl_dat (C)
+
+__ops_dat ops_decl_dat(ops_block block, int dim, int *size, int *base, int *d_m, int *d_p, T *data, char *type, char *name)__
+
+This routine defines a dataset.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|block | structured block |
+|dim | dimension of dataset (number of items per grid element) |
+|size | size in each dimension of the block |
+|base | base indices in each dimension of the block |
+|d_m | padding from the face in the negative direction for each dimension (used for block halo) |
+|d_p | padding from the face in the positive direction for each dimension (used for block halo) |
+|data | input data of type *T* |
+|type | the name of type used for output diagnostics (e.g. ``double``, ``float``)|
+|name | a name used for output diagnostics|
+
+The `size` argument allows declaring differently sized data arrays on a given
+`block`. `d_m` and `d_p` are the depths of the "block halos" that are used to
+indicate the offset from the edge of a block (in both the negative and
+positive directions of each dimension).
+
+##### ops_block_core::decl_dat (C++)
+The method ops_block_core::decl_dat is used to define an ops_dat object; it accepts almost the same arguments as its C counterpart, except that the block argument is not necessary, e.g.,
+```C++
+//declare ops_dat with dim = 2
+ops_dat dat0 = grid2D->decl_dat(2, size, base, d_m, d_p, temp, "double", "dat0");
+ops_dat dat1 = grid2D->decl_dat(2, size, base, d_m, d_p, temp, "double", "dat1");
+```
+where grid2D is an ops_block_core object which must be defined before this.
+
+##### ops_decl_dat_hdf5 (C)
+
+__ops_dat ops_decl_dat_hdf5(ops_block block, int dim, char *type, char *name, char *file)__
+
+This routine defines a dataset to be read in from a named hdf5 file.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|block | structured block|
+|dim | dimension of dataset (number of items per grid element)|
+|type | the name of type used for output diagnostics (e.g. ``double``,``float``)|
+|name | name of the dat used for output diagnostics|
+|file | hdf5 file to read and obtain the data from|
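+
+For completeness, a C style counterpart of the C++ example above might look as follows (a sketch; the sizes and halo depths are illustrative, and the `d_m` entries are given as negative values, matching the convention noted later in the data access API section):
+
+```c++
+int size[2] = {nx, ny};          // grid points in each dimension
+int base[2] = {0, 0};            // base indices
+int d_m[2]  = {-1, -1};          // one-deep block halo, negative direction
+int d_p[2]  = {1, 1};            // one-deep block halo, positive direction
+double *temp = NULL;             // NULL pointer: OPS allocates the memory
+ops_dat dat0 = ops_decl_dat(grid2D, 1, size, base, d_m, d_p, temp, "double", "dat0");
+```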
+
+#### Global constant
+##### ops_decl_const (C)
+
+__void ops_decl_const(char const * name, int dim, char const * type, T * data )__
+
+This routine defines a global constant: a variable in global scope. Global constants need to be declared upfront
+so that they can be correctly handled for different parallelizations, e.g. CUDA on GPUs. Once defined
+they remain unchanged throughout the program, unless changed by a call to ops_update_const(..). The `name` and `type`
+parameters **must** be string literals since they are used in the code generation step.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|name | a name used to identify the constant |
+|dim | dimension of dataset (number of items per element) |
+|type | the name of type used for output diagnostics (e.g. ``double``, ``float``) |
+|data | pointer to input data of type *T* |
+
+##### OPS_instance::decl_const (C++)
+
+The method accepts the same arguments as its C counterpart.
+
+#### Halo definition
+##### ops_decl_halo (C)
+
+__ops_halo ops_decl_halo(ops_dat from, ops_dat to, int *iter_size, int* from_base, int *to_base, int *from_dir, int *to_dir)__
+
+| Arguments | Description |
+| ----------- | ----------- |
+|from | origin dataset |
+|to| destination dataset |
+|iter_size | defines an iteration size (number of indices to iterate over in each direction) |
+|from_base | indices of starting point in \"from\" dataset|
+|to_base | indices of starting point in \"to\" dataset |
+|from_dir | direction of incrementing for \"from\" for each dimension of `iter_size` |
+|to_dir | direction of incrementing for \"to\" for each dimension of `iter_size`|
+
+A from_dir \[1,2\] and a to_dir \[2,1\] means that x in the first block
+goes to y in the second block, and y in the first block goes to x in the second
+block. A negative sign indicates that the axis is flipped. (Simple
+example: a transfer from (1:2,0:99,0:99) to (-1:0,0:99,0:99) would use
+iter_size = \[2,100,100\], from_base = \[1,0,0\], to_base = \[-1,0,0\],
+from_dir = \[0,1,2\], to_dir = \[0,1,2\]. In more complex cases this
+allows for transfers between blocks with different orientations.)
+
+##### OPS_instance::decl_halo (C++)
+The method accepts the same arguments as its C counterpart.
+
+##### ops_decl_halo_hdf5 (C)
+
+__ops_halo ops_decl_halo_hdf5(ops_dat from, ops_dat to, char* file)__
+
+This routine reads in a halo relationship between two datasets defined on two different blocks from a named HDF5 file.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|from| origin dataset|
+|to| destination dataset|
+|file| hdf5 file to read and obtain the data from|
+
+##### ops_decl_halo_group (C)
+
+__ops_halo_group ops_decl_halo_group(int nhalos, ops_halo *halos)__
+
+This routine defines a collection of halos. Semantically, when an exchange is triggered for all halos in a group, there is no order defined in which they are carried out.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|nhalos| number of halos in *halos* |
+|halos| array of halos|
+
+##### OPS_instance::decl_halo_group (C++)
+
+The method accepts the same arguments as its C counterpart.
+
+#### Reduction handle
+##### ops_decl_reduction_handle (C)
+
+__ops_reduction ops_decl_reduction_handle(int size, char *type, char *name)__
+
+This routine defines a reduction handle to be used in a parallel loop.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|size| size of data in bytes |
+|type| the name of type used for output diagnostics (e.g. ``double``,``float``) |
+|name| name of the dat used for output diagnostics|
+
+##### ops_reduction_result (C)
+
+__void ops_reduction_result(ops_reduction handle, T *result)__
+
+This routine returns the reduced value held by a reduction handle. When OPS uses lazy execution, this will trigger the execution of all previously queued OPS operations.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|handle| the *ops_reduction* handle |
+|result| a pointer to write the results to; the memory size has to match the declared size |
+
+##### OPS_instance::decl_reduction_handle (C++)
+The method accepts the same arguments as its C counterpart.
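+
+A typical usage pattern is sketched below (the handle and variable names are illustrative, and the reduction is combined inside a parallel loop via `ops_arg_reduce`, described later in this chapter):
+
+```c++
+ops_reduction h_err = ops_decl_reduction_handle(sizeof(double), "double", "error");
+// ... one or more ops_par_loop calls that pass
+//     ops_arg_reduce(h_err, 1, "double", OPS_MAX) as an argument ...
+double err = 0.0;
+ops_reduction_result(h_err, &err);  // acquires the result, triggers queued loops
+```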
+
+#### Partition
+##### ops_partition (C)
+
+__ops_partition(char *method)__
+
+Triggers a multi-block partitioning across a distributed memory set of processes (links to a dummy function for single node parallelizations). This routine should only be called after all the ops_decl_block
+and ops_decl_dat statements have been declared.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|method| string describing the partitioning method. Currently this string is not used internally, but is simply a place-holder to indicate different partitioning methods in the future. |
+
+##### OPS_instance::partition (C++)
+
+The method accepts the same arguments as its C counterpart.
+### Diagnostic and output routines
+
+#### ops_diagnostic_output (C)
+
+__void ops_diagnostic_output()__
+
+This routine prints out various useful bits of diagnostic info about sets, mappings and datasets. Usually used right
+after an ops_partition() call to print out the details of the decomposition.
+
+#### OPS_instance::diagnostic_output (C++)
+Same as the C counterpart.
+#### ops_printf
+
+__void ops_printf(const char * format, ...)__
+
+This routine simply prints a variable number of arguments; it is created in place of the standard C
+printf function, which would print the same output on each MPI process.
+
+#### ops_timers
+
+__void ops_timers(double *cpu, double *et)__
+
+gettimeofday() based timer to start/end timing blocks of code.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|cpu| variable to hold the CPU time at the time of invocation|
+|et| variable to hold the elapsed time at the time of invocation|
+
+#### ops_fetch_block_hdf5_file
+
+__void ops_fetch_block_hdf5_file(ops_block block, char *file)__
+
+Write the details of an ops_block to a named HDF5 file. Can be used over MPI (puts the data in an ops_dat into an
+HDF5 file using MPI I/O).
+
+| Arguments | Description |
+| ----------- | ----------- |
+|block| ops_block to be written|
+|file| hdf5 file to write to|
+
+#### ops_fetch_stencil_hdf5_file
+
+__void ops_fetch_stencil_hdf5_file(ops_stencil stencil, char *file)__
+
+Write the details of an ops_stencil to a named HDF5 file. Can be used over MPI (puts the data in an ops_dat into an HDF5 file using MPI I/O).
+
+| Arguments | Description |
+| ----------- | ----------- |
+|stencil| ops_stencil to be written|
+|file| hdf5 file to write to|
+
+#### ops_fetch_dat_hdf5_file
+
+__void ops_fetch_dat_hdf5_file(ops_dat dat, const char *file)__
+
+Write the details of an ops_dat to a named HDF5 file. Can be used over MPI (puts the data in an ops_dat into an
+HDF5 file using MPI I/O).
+
+| Arguments | Description |
+| ----------- | ----------- |
+|dat| ops_dat to be written|
+|file| hdf5 file to write to|
+
+#### ops_print_dat_to_txtfile
+
+__void ops_print_dat_to_txtfile(ops_dat dat, char *file)__
+
+Write the details of an ops_dat to a named text file. When used under an MPI parallelization each MPI process
+will write its own data set separately to the text file. As such it does not use MPI I/O. The data can be viewed using
+a simple text editor.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|dat| ops_dat to be written|
+|file| text file to write to|
+
+#### ops_timing_output
+
+__void ops_timing_output(FILE *os)__
+
+Print OPS performance details to an output stream.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|os| output stream, use stdout to print to standard out|
+
+#### ops_NaNcheck
+
+__void ops_NaNcheck(ops_dat dat)__
+
+Check if any of the values held in the *dat* is a NaN. If a NaN
+is found, prints an error message and exits.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|dat| ops_dat to be checked|
+
+### Halo exchange
+
+#### ops_halo_transfer (C)
+
+__void ops_halo_transfer(ops_halo_group group)__
+
+This routine exchanges all halos in a halo group and will block execution of subsequent computations that depend on
+the exchanged data.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|group| the halo group|
+
+### Parallel loop syntax
+
+A parallel loop with N arguments has the following syntax:
+
+#### ops_par_loop
+
+__void ops_par_loop(void (*kernel)(...),char *name, ops_block block, int dims, int *range, ops_arg arg1,ops_arg arg2, ..., ops_arg argN )__
+
+| Arguments | Description |
+| ----------- | ----------- |
+|kernel| user's kernel function with N arguments|
+|name| name of kernel function, used for output diagnostics|
+|block| the ops_block over which this loop executes|
+|dims| dimension of loop iteration|
+|range| iteration range array|
+|args| arguments|
+
+The **ops_arg** arguments in **ops_par_loop** are provided by one of the
+following routines: one for global constants and reductions, and the other
+for OPS datasets.
+
+#### ops_arg_gbl
+
+__ops_arg ops_arg_gbl(T *data, int dim, char *type, ops_access acc)__
+
+Passes a scalar or small array that is invariant of the iteration space (not to be confused with ops_decl_const, which facilitates global scope variables).
+
+| Arguments | Description |
+| ----------- | ----------- |
+|data| data array|
+|dim| array dimension|
+|type| string representing the type of data held in data|
+|acc| access type|
+
+#### ops_arg_reduce
+
+__ops_arg ops_arg_reduce(ops_reduction handle, int dim, char *type, ops_access acc)__
+
+Passes a pointer to a variable that needs to be incremented (or swapped for min/max reduction) by the user kernel.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|handle| an *ops_reduction* handle|
+|dim| array dimension (according to *type*)|
+|type| string representing the type of data held in data|
+|acc| access type|
+
+#### ops_arg_dat
+
+__ops_arg ops_arg_dat(ops_dat dat, ops_stencil stencil, char *type, ops_access acc)__
+
+Passes a pointer, wrapped in an ACC<> object, to the value(s) at the current grid point to the user kernel. The ACC object's parentheses operator has to be used for dereferencing the pointer.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|dat| dataset|
+|stencil| stencil for accessing data|
+|type| string representing the type of data held in dataset|
+|acc| access type|
+
+#### ops_arg_idx
+
+__ops_arg ops_arg_idx()__
+
+Gives you an array of integers (in the user kernel) that holds the index of
+the current grid point, i.e. idx[0] is the index in x, idx[1] is the index in y, etc. This is a globally consistent
+index, so even if the block is distributed across different MPI partitions, it gives you the same indices. Generally
+used to generate initial geometry.
+
+### Stencils
+
+The final ingredient is the stencil specification, for which we have two versions: simple and strided.
+
+#### ops_decl_stencil (C)
+
+__ops_stencil ops_decl_stencil(int dims,int points, int *stencil, char *name)__
+
+| Arguments | Description |
+| ----------- | ----------- |
+|dims| dimension of loop iteration|
+|points| number of points in the stencil|
+|stencil| stencil for accessing data|
+|name| string representing the name of the stencil|
+
+#### OPS_instance::decl_stencil (C++)
+
+The method accepts the same arguments as its C counterpart.
+#### ops_decl_strided_stencil (C)
+
+__ops_stencil ops_decl_strided_stencil(int dims, int points, int *stencil, int *stride, char *name)__
+
+| Arguments | Description |
+| ----------- | ----------- |
+|dims| dimension of loop iteration|
+|points| number of points in the stencil|
+|stencil| stencil for accessing data|
+|stride| stride for accessing data|
+|name| string representing the name of the stencil|
+
+#### OPS_instance::decl_strided_stencil (C++)
+
+The method accepts the same arguments as its C counterpart.
+
+#### ops_decl_stencil_hdf5
+
+__ops_stencil ops_decl_stencil_hdf5(int dims,int points, char *name, char* file)__
+
+| Arguments | Description |
+| ----------- | ----------- |
+|dims| dimension of loop iteration|
+|points| number of points in the stencil|
+|name| string representing the name of the stencil|
+|file| hdf5 file to read and obtain the stencil from|
+
+In the strided case, the semantics for the index of data to be
+accessed, for stencil point *p*, in dimension *m* are defined as
+
+```c++
+ stride[m]*loop_index[m] + stencil[p*dims+m]
+```
+
+where ``loop_index[m]`` is the iteration index (within the
+user-defined iteration space) in the different dimensions.
+
+If, for one or more dimensions, both ``stride[m]`` and
+``stencil[p*dims+m]`` are zero, then one of the following must be true:
+
+* the dataset being referenced has size 1 for these dimensions
+
+* these dimensions are to be omitted and so the dataset has
+dimension equal to the number of remaining dimensions.
+
+See *OPS/apps/c/CloverLeaf/build_field.cpp* and *OPS/apps/c/CloverLeaf/generate.cpp* for an example *ops_decl_strided_stencil* declaration and its use in a loop, respectively.
+
+These two stencil definitions probably take care of all of the
+cases in the Introduction except for multiblock applications with interfaces
+with different orientations -- this will need a third, even more general,
+stencil specification. The strided stencil will handle both multigrid
+(with a stride of 2 for example) and the boundary condition and reduced
+dimension applications (with a stride of 0 for the relevant dimensions).
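+
+As a sketch, the stride-$(1,0)$ stencil for the degenerate xcoords dataset mentioned in the Key Concepts section could be declared as follows (the names are illustrative):
+
+```c++
+int point[]  = {0, 0};   // single zero-offset point
+int stride[] = {1, 0};   // step in x only; the degenerate y dimension stays fixed
+ops_stencil S2D_XCOORD = ops_decl_strided_stencil(2, 1, point, stride, "S2D_XCOORD");
+```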
+
+### Checkpointing
+
+OPS supports the automatic checkpointing of applications. Using the API below, the user specifies the file name for the checkpoint and an average time interval between checkpoints; OPS will then automatically and periodically save all information required to fast-forward to the last checkpoint if a crash occurred. Currently, when re-launching after a crash, the same number of MPI processes has to be used. To enable checkpointing mode, the *OPS_CHECKPOINT* runtime argument has to be used. (**Do we also need to define the CHECKPOINTING compiler directive?**)
+
+#### ops_checkpointing_init
+
+__bool ops_checkpointing_init(const char *filename, double interval, int options)__
+
+Initialises the checkpointing system; has to be called after *ops_partition*. Returns true if the application launches in restore
+mode, false otherwise.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|filename| name of the file for checkpointing. In MPI, this will automatically be post-fixed with the rank ID.|
+|interval| average time (seconds) between checkpoints|
+|options| a combination of flags, listed in *ops_checkpointing.h*, also see below|
+
+* OPS_CHECKPOINT_INITPHASE - indicates that there are a number of parallel loops at the very beginning of the simulation which should be excluded from any checkpoint, mainly because they initialise datasets that do not change during the main body of the execution. During restore mode these loops are executed as usual. An example would be the computation of the mesh geometry, which can be excluded from the checkpoint if it is re-computed when recovering and restoring a checkpoint. The API call *void ops_checkpointing_initphase_done()* indicates the end of this initial phase.
+
+* OPS_CHECKPOINT_MANUAL_DATLIST - indicates that the user manually controls the location of the checkpoint, and explicitly specifies the list of *ops_dat*s to be saved.
+
+* OPS_CHECKPOINT_FASTFW - indicates that the user manually controls the location of the checkpoint, and it also enables fast-forwarding, by skipping the execution of the application (even though none of the parallel loops would actually execute, there may be significant work outside of those) up to the checkpoint.
+
+* OPS_CHECKPOINT_MANUAL - indicates that when the corresponding API function is called, the checkpoint should be created. Assumes the presence of the above two options as well.
+
+#### ops_checkpointing_manual_datlist
+
+__void ops_checkpointing_manual_datlist(int ndats, ops_dat *datlist)__
+
+A user can call this routine at a point in the code to mark the location of a checkpoint. At this point, the list of datasets specified
+will be saved. The validity of what is saved is not checked by the checkpointing algorithm; it is assumed that the user knows
+which datasets need to be saved for full recovery. This routine should be called frequently (compared to the checkpointing frequency) and it will trigger the creation of the checkpoint the first time it is called after the timeout occurs.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|ndats| number of datasets to be saved|
+|datlist| array of *ops_dat* handles to be saved|
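+
+A minimal sketch of this mode, with invented dataset names and an assumed 600-second interval:
+
+```c++
+bool restored = ops_checkpointing_init("checkpoint.h5", 600.0,
+                                       OPS_CHECKPOINT_MANUAL_DATLIST);
+// ... inside the main time-stepping loop:
+ops_dat datlist[] = {density, energy};
+ops_checkpointing_manual_datlist(2, datlist);  // only saves after the timeout
+```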
+
+#### ops_checkpointing_fastfw
+
+__bool ops_checkpointing_fastfw(int nbytes, char *payload)__
+
+A user can call this routine at a point in the code to mark the location of a checkpoint. At this point, the
+specified payload (e.g. iteration count, simulation time, etc.) along with the necessary datasets, as determined by the
+checkpointing algorithm, will be saved. This routine should be called frequently (compared to the checkpointing frequency),
+and it will trigger the creation of the checkpoint the first time it is called after the timeout occurs. In restore mode, it
+will restore all datasets the first time it is called, and return true indicating that the saved payload is returned
+in payload. Does not save reduction data.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|nbytes| size of the payload in bytes|
+|payload| pointer to memory into which the payload is packed|
+
+#### ops_checkpointing_manual_datlist_fastfw
+
+__bool ops_checkpointing_manual_datlist_fastfw(int ndats, ops_dat *datlist, int nbytes, char *payload)__
+
+Combines the manual datlist and fastfw calls.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|ndats| number of datasets to be saved|
+|datlist| array of *ops_dat* handles to be saved|
+|nbytes| size of the payload in bytes|
+|payload| pointer to memory into which the payload is packed|
+
+#### ops_checkpointing_manual_datlist_fastfw_trigger
+
+__bool ops_checkpointing_manual_datlist_fastfw_trigger(int ndats, ops_dat *datlist, int
+nbytes, char *payload)__
+
+With this routine it is possible to manually trigger checkpointing, instead of relying on the timeout process. As such,
+it combines the manual datlist and fastfw calls, and triggers the creation of a checkpoint when called.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|ndats| number of datasets to be saved|
+|datlist| array of *ops_dat* handles to be saved|
+|nbytes| size of the payload in bytes|
+|payload| pointer to memory into which the payload is packed|
+
+The suggested use of these **manual** functions is of course when the optimal location for checkpointing
+is known - one of the ways to determine that is to use the built-in algorithm. More details of this will be reported
+in a tech-report on checkpointing, to be published later.
+
+### Access to OPS data
+
+This section describes APIs that give the user access to internal data structures in OPS and return data to user-space. These should be used cautiously and sparingly, as they can affect performance significantly.
+
+#### ops_dat_get_local_npartitions (C)
+
+__int ops_dat_get_local_npartitions(ops_dat dat)__
+
+This routine returns the number of chunks of the given dataset held by the current process.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|dat| the dataset|
+
+#### ops_dat_core::get_local_npartitions (C++)
+The C++ version of ``ops_dat_get_local_npartitions``, which takes no arguments.
+#### ops_dat_get_global_npartitions (C)
+
+__int ops_dat_get_global_npartitions(ops_dat dat)__
+
+This routine returns the number of chunks of the given dataset held by all processes.
+
+| Arguments | Description |
+| ----------- | ----------- |
+|dat| the dataset|
+
+#### ops_dat_core::get_global_npartitions (C++)
+The C++ version of ``ops_dat_get_global_npartitions``, which takes no arguments.
+#### ops_dat_get_extents (C)
+
+__void ops_dat_get_extents(ops_dat dat, int part, int *disp, int *sizes)__
+
+This routine returns the MPI displacement and size of a given chunk of the given dataset on the current process.
### Access to OPS data

This section describes APIs that give the user access to internal data structures in OPS and return data to user-space. These should be used cautiously and sparingly, as they can affect performance significantly.

#### ops_dat_get_local_npartitions (C)

__int ops_dat_get_local_npartitions(ops_dat dat)__

This routine returns the number of chunks of the given dataset held by the current process.

| Arguments | Description |
| ----------- | ----------- |
|dat| the dataset|

#### ops_dat_core::get_local_npartitions (C++)

The C++ version of ``ops_dat_get_local_npartitions``, which takes no arguments.

#### ops_dat_get_global_npartitions (C)

__int ops_dat_get_global_npartitions(ops_dat dat)__

This routine returns the number of chunks of the given dataset held by all processes.

| Arguments | Description |
| ----------- | ----------- |
|dat| the dataset|

#### ops_dat_core::get_global_npartitions (C++)

The C++ version of ``ops_dat_get_global_npartitions``, which takes no arguments.

#### ops_dat_get_extents (C)

__void ops_dat_get_extents(ops_dat dat, int part, int *disp, int *sizes)__

This routine returns the MPI displacement and size of a given chunk of the given dataset on the current process.

| Arguments | Description |
| ----------- | ----------- |
|dat| the dataset|
|part| the chunk index (has to be 0)|
|disp| an array populated with the displacement of the chunk within the "global" distributed array|
|sizes| an array populated with the spatial extents|

#### ops_dat_core::get_extents (C++)

The C++ version of ``ops_dat_get_extents``, where the arguments are the same except that the *ops_dat* argument is not needed.

#### ops_dat_get_raw_metadata (C)

__char* ops_dat_get_raw_metadata(ops_dat dat, int part, int *disp, int *size, int *stride, int *d_m, int *d_p)__

This routine returns array shape metadata corresponding to the ops_dat. Any of the arguments that are not of interest may be NULL.

| Arguments | Description |
| ----------- | ----------- |
|dat| the dataset|
|part| the chunk index (has to be 0)|
|disp| an array populated with the displacement of the chunk within the "global" distributed array|
|size| an array populated with the spatial extents|
|stride| an array populated with the strides in spatial dimensions needed for column-major indexing|
|d_m| an array populated with the padding on the left in each dimension. Note that these are negative values|
|d_p| an array populated with the padding on the right in each dimension|

#### ops_dat_core::get_raw_metadata (C++)

The C++ version of ``ops_dat_get_raw_metadata``, where the arguments are the same except that the *ops_dat* argument is not needed.

#### ops_dat_get_raw_pointer (C)

__char* ops_dat_get_raw_pointer(ops_dat dat, int part, ops_stencil stencil, ops_memspace *memspace)__

This routine returns a pointer to the internally stored data, with MPI halo regions automatically updated as required by the supplied stencil. The strides required to index into the dataset are also given.

| Arguments | Description |
| ----------- | ----------- |
|dat| the dataset|
|part| the chunk index (has to be 0)|
|stencil| a stencil used to determine the required MPI halo exchange depths|
|memspace| when set to OPS_HOST or OPS_DEVICE, returns a pointer to data in that memory space; otherwise must be set to 0, and returns whether the data is on the host or on the device|

#### ops_dat_core::get_raw_pointer (C++)

The C++ version of ``ops_dat_get_raw_pointer``, where the arguments are the same except that the *ops_dat* argument is not needed.

#### ops_dat_release_raw_data (C)

__void ops_dat_release_raw_data(ops_dat dat, int part, ops_access acc)__

Indicates to OPS that a dataset previously accessed with ops_dat_get_raw_pointer is released by the user, and also tells OPS how it was accessed.

A single call to ops_dat_release_raw_data() releases all pointers obtained by previous calls to ops_dat_get_raw_pointer() on the same dat and with the same *memspace argument, i.e. calls do not nest.

| Arguments | Description |
| ----------- | ----------- |
|dat| the dataset|
|part| the chunk index (has to be 0)|
|acc| the kind of access that was used by the user (OPS_READ if it was read only, OPS_WRITE if it was overwritten, OPS_RW if it was read and written)|

#### ops_dat_core::release_raw_data (C++)

The C++ version of ``ops_dat_release_raw_data``, where the arguments are the same except that the *ops_dat* argument is not needed.
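A sketch of the intended get/release pattern, assuming an existing *ops_dat* `dat` and a hypothetical zero-offset point stencil `S2D_00`:

```c++
// Hypothetical sketch: direct host-side access to a dataset.
ops_memspace memspace = OPS_HOST;  // request a pointer in host memory
char *raw = ops_dat_get_raw_pointer(dat, 0, S2D_00, &memspace);
// ... read and modify the data through 'raw' ...
// Release all pointers obtained above and tell OPS how they were used.
ops_dat_release_raw_data(dat, 0, OPS_RW);
```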
#### ops_dat_fetch_data (C)

__void ops_dat_fetch_data(ops_dat dat, int part, char *data)__

This routine copies the data held by OPS to the user-specified memory location, which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents.

| Arguments | Description |
| ----------- | ----------- |
|dat| the dataset|
|part| the chunk index (has to be 0)|
|data| pointer to memory which should be filled by OPS|

#### ops_dat_fetch_data_memspace (C)

__void ops_dat_fetch_data_memspace(ops_dat dat, int part, char *data, ops_memspace memspace)__

This routine copies the data held by OPS to the user-specified memory location, which needs to be at least as large as indicated by the sizes parameter of ops_dat_get_extents.

| Arguments | Description |
| ----------- | ----------- |
|dat| the dataset|
|part| the chunk index (has to be 0)|
|data| pointer to memory which should be filled by OPS|
|memspace| the memory space where the data pointer is|

#### ops_dat_core::fetch_data (C++)

The C++ version of ``ops_dat_fetch_data_memspace``, where the arguments are the same except that the *ops_dat* argument is not needed.

#### ops_dat_set_data (C)

__void ops_dat_set_data(ops_dat dat, int part, char *data)__

This routine copies the data given by the user to the internal data structure used by OPS. User data needs to be laid out in column-major order and strided as indicated by the sizes parameter of ops_dat_get_extents.

| Arguments | Description |
| ----------- | ----------- |
|dat| the dataset|
|part| the chunk index (has to be 0)|
|data| pointer to memory which should be copied to OPS|

#### ops_dat_set_data_memspace (C)

__void ops_dat_set_data_memspace(ops_dat dat, int part, char *data, ops_memspace memspace)__

This routine copies the data given by the user to the internal data structure used by OPS. User data needs to be laid out in column-major order and strided as indicated by the sizes parameter of ops_dat_get_extents.

| Arguments | Description |
| ----------- | ----------- |
|dat| the dataset|
|part| the chunk index (has to be 0)|
|data| pointer to memory which should be copied to OPS|
|memspace| the memory space where the data pointer is|

#### ops_dat_core::set_data (C++)

The C++ version of ``ops_dat_set_data_memspace``, where the arguments are the same except that the *ops_dat* argument is not needed.
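A sketch of fetching a dataset into user memory and writing it back, assuming a hypothetical 2D *ops_dat* `dat` holding doubles; the buffer is sized from the extents reported by OPS:

```c++
#include <vector>

// Hypothetical sketch: copy a 2D dataset out of OPS, sized from the
// extents OPS reports, then copy the (possibly modified) data back.
int disp[2], sizes[2];
ops_dat_get_extents(dat, 0, disp, sizes);
std::vector<double> buf((size_t)sizes[0] * sizes[1]);
ops_dat_fetch_data_memspace(dat, 0, (char*)buf.data(), OPS_HOST);
// ... work on 'buf' (column-major layout) ...
ops_dat_set_data_memspace(dat, 0, (char*)buf.data(), OPS_HOST);
```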
### Linear algebra solvers

#### Tridiagonal solver

This section specifies APIs that allow [Tridsolver](https://github.com/OP-DSL/tridsolver) (a tridiagonal solver library) to be called from OPS. The library can be used to solve a large number of tridiagonal systems of equations stored in multidimensional datasets. Parameters that are passed to Tridsolver from OPS are stored in an `ops_tridsolver_params` object. The constructor for this class takes the `ops_block` that the datasets are defined over as an argument, and optionally also a solving strategy to use (only relevant to MPI applications). The following solving strategies are available (see Tridsolver for more details about these):

- GATHER_SCATTER (not available for GPUs)
- ALLGATHER
- LATENCY_HIDING_TWO_STEP
- LATENCY_HIDING_INTERLEAVED
- JACOBI
- PCR (default)

Parameters specific to different solving strategies can then be set using setter methods. For applications using MPI, it is beneficial to reuse `ops_tridsolver_params` objects between solves as much as possible, due to the setup cost involved in creating Tridsolver's MPI communicators.

##### ops_tridMultiDimBatch

__void ops_tridMultiDimBatch(int ndim, int solvedim, int* dims, ops_dat a, ops_dat b, ops_dat c, ops_dat d, ops_tridsolver_params *tridsolver_ctx)__

This solves multiple tridiagonal systems of equations in multidimensional datasets along the specified dimension. The matrix is stored in the `a` (bottom diagonal), `b` (central diagonal) and `c` (top diagonal) datasets. The right hand side is stored in the `d` dataset, and the result is also written to this dataset.

| Arguments | Description |
| ----------- | ----------- |
|ndim| the dimension of the datasets |
|solvedim| the dimension to solve along |
|dims| the size of each dimension (excluding any padding) |
|a| the dataset for the lower diagonal |
|b| the dataset for the central diagonal |
|c| the dataset for the upper diagonal |
|d| the dataset for the right hand side, also where the solution is written to |
|tridsolver_ctx| an object containing the parameters for the Tridsolver library |

##### ops_tridMultiDimBatch_Inc

__void ops_tridMultiDimBatch_Inc(int ndim, int solvedim, int* dims, ops_dat a, ops_dat b, ops_dat c, ops_dat d, ops_dat u, ops_tridsolver_params *tridsolver_ctx)__

This solves multiple tridiagonal systems of equations in multidimensional datasets along the specified dimension. The matrix is stored in the `a` (bottom diagonal), `b` (central diagonal) and `c` (top diagonal) datasets. The right hand side is stored in the `d` dataset and the result is added to the `u` dataset.

| Arguments | Description |
| ----------- | ----------- |
|ndim| the dimension of the datasets |
|solvedim| the dimension to solve along |
|dims| the size of each dimension (excluding any padding) |
|a| the dataset for the lower diagonal |
|b| the dataset for the central diagonal |
|c| the dataset for the upper diagonal |
|d| the dataset for the right hand side |
|u| the dataset that the solution is added to |
|tridsolver_ctx| an object containing the parameters for the Tridsolver library |
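A sketch of a batched solve, assuming 3D datasets `a`, `b`, `c` and `d` defined over a block `block` with hypothetical extents `nx`, `ny`, `nz`; the object lifetime shown (new/delete) is an assumption:

```c++
// Hypothetical sketch: solve tridiagonal systems along the y dimension
// (solvedim = 1) of 3D datasets, using the default PCR strategy.
ops_tridsolver_params *trid = new ops_tridsolver_params(block);
int dims[3] = {nx, ny, nz};  // extents excluding padding (assumed)
ops_tridMultiDimBatch(3, 1, dims, a, b, c, d, trid);
// Reuse 'trid' for further solves over the same block where possible,
// to amortise the cost of setting up Tridsolver's MPI communicators.
delete trid;
```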
## Runtime Flags and Options

The following is a list of all the runtime flags and options that can be used when executing OPS-generated applications.

* `OPS_DIAGS=` : set the OPS diagnostics level at runtime.

    `OPS_DIAGS=1` - no diagnostics, default level to achieve the best runtime performance.

    `OPS_DIAGS>1` - print block decomposition and `ops_par_loop` timing breakdown.

    `OPS_DIAGS>4` - print intra-block halo buffer allocation feedback (for OPS internal development only).

    `OPS_DIAGS>5` - check that intra-block halo MPI send depths match MPI receive depths (for OPS internal development only).

* `OPS_BLOCK_SIZE_X=`, `OPS_BLOCK_SIZE_Y=` and `OPS_BLOCK_SIZE_Z=` : the CUDA (and OpenCL) thread block sizes in the X, Y and Z dimensions. Each size should be an integer between 1 and 1024, and currently they should be selected such that `OPS_BLOCK_SIZE_X` * `OPS_BLOCK_SIZE_Y` * `OPS_BLOCK_SIZE_Z` does not exceed 1024.

* `-gpudirect` : enable GPU direct support when executing MPI+CUDA executables.

* `OPS_CL_DEVICE=` : select the OpenCL device for execution. Usually `OPS_CL_DEVICE=0` selects the CPU and `OPS_CL_DEVICE=1` selects GPUs. The selected device will be reported by OPS during execution.

* `OPS_TILING` : execute OpenMP code with cache-blocking tiling. See the [Performance Tuning](https://github.com/OP-DSL/OPS/blob/MarkdownDocDev/doc/perf.md) section.

* `OPS_TILING_MAXDEPTH=` : execute MPI+OpenMP code with cache-blocking tiling and further communication avoidance. See the [Performance Tuning](https://github.com/OP-DSL/OPS/blob/MarkdownDocDev/doc/perf.md) section.

## Doxygen

Doxygen generated from OPS source can be found [here](https://op-dsl-ci.gitlab.io/ops-ci/).

diff --git a/doc/perf.md b/doc/perf.md
new file mode 100644
index 0000000000..1cec2e145f
--- /dev/null
+++ b/doc/perf.md
@@ -0,0 +1,57 @@

# Performance Tuning

## Executing with GPUDirect

To enable GPU direct support for MPI+CUDA executables (on the OPS side), add **-gpudirect** when running the executable. You may also have to use certain environmental flags when using different MPI distributions. For an example of the required flags and environmental settings on the Cambridge Wilkes2 GPU cluster see:

## Cache-blocking Tiling

OPS has a code-generation variant (ops_gen_mpi_lazy) and a build target for tiling. Once compiled, use the `OPS_TILING` runtime parameter to enable it. This will look at the L3 cache size of your CPU and guess the correct tile size. If you want to alter the amount of cache to be used for the guess, use the ``OPS_CACHE_SIZE=XX`` runtime parameter, where the value is in megabytes. To manually specify the tile sizes, use the ``OPS_TILESIZE_X``, ``OPS_TILESIZE_Y``, and ``OPS_TILESIZE_Z`` runtime arguments.

When MPI is combined with OpenMP, tiling can be extended to the MPI halos. Set `OPS_TILING_MAXDEPTH` to increase the halo depths so that halos for multiple `ops_par_loop`s can be exchanged with a single MPI message (see [TPDS2017](https://ieeexplore.ieee.org/abstract/document/8121995) for more details).

To test, compile CloverLeaf under ``OPS/apps/c/CloverLeaf``, modify clover.in to use a $6144^2$ mesh, then run as follows.

For OpenMP with tiling:
```bash
export OMP_NUM_THREADS=xx; numactl --cpunodebind=0 ./cloverleaf_tiled OPS_TILING
```
For MPI+OpenMP with tiling:
```bash
export OMP_NUM_THREADS=xx; mpirun -np xx ./cloverleaf_mpi_tiled OPS_TILING OPS_TILING_MAXDEPTH=6
```
To manually specify the tile sizes (in number of grid points), use the OPS_TILESIZE_X, OPS_TILESIZE_Y, and OPS_TILESIZE_Z runtime arguments:
```bash
export OMP_NUM_THREADS=xx; numactl --cpunodebind=0 ./cloverleaf_tiled OPS_TILING OPS_TILESIZE_X=600 OPS_TILESIZE_Y=200
```

## OpenMP and OpenMP+MPI

It is recommended that you assign one MPI rank per NUMA region when executing MPI+OpenMP parallel code. Usually, for a multi-CPU system, a single CPU socket is a single NUMA region. Thus, for a 4-socket system, OPS's MPI+OpenMP code should be executed with 4 MPI processes, each MPI process having multiple OpenMP threads (typically specified by the `OMP_NUM_THREADS` environment variable). Additionally, on some systems, using `numactl` to bind threads to cores can give performance improvements (see `OPS/scripts/numawrap` for an example script that wraps the `numactl` command to be used with common MPI distributions).

## CUDA arguments

The CUDA (and OpenCL) thread block sizes can be controlled by setting the ``OPS_BLOCK_SIZE_X``, ``OPS_BLOCK_SIZE_Y`` and ``OPS_BLOCK_SIZE_Z`` runtime arguments. For example:
```bash
./cloverleaf_cuda OPS_BLOCK_SIZE_X=64 OPS_BLOCK_SIZE_Y=4
```

## OpenCL arguments

The `OPS_CL_DEVICE=XX` runtime flag sets the OpenCL device to execute the code on. Usually `OPS_CL_DEVICE=0` selects the CPU and `OPS_CL_DEVICE=1` selects GPUs.

diff --git a/doc/pubs.md b/doc/pubs.md
new file mode 100644
index 0000000000..8c6f967998
--- /dev/null
+++ b/doc/pubs.md
@@ -0,0 +1,3 @@

# Publications

See [OP-DSL publications page](https://op-dsl.github.io/papers.html).
diff --git a/doc/quickstart.md b/doc/quickstart.md
new file mode 100644
index 0000000000..93813d43d9
--- /dev/null
+++ b/doc/quickstart.md
@@ -0,0 +1,3 @@

# Quick start

## How to use math

$$\alpha$$
\ No newline at end of file

diff --git a/doc/requirement.txt b/doc/requirement.txt
new file mode 100644
index 0000000000..9af8e80a61
--- /dev/null
+++ b/doc/requirement.txt
@@ -0,0 +1,2 @@

# We set the tools needed by Sphinx
myst-parser == 0.15.2

diff --git a/ops_translator/c/ops.py b/ops_translator/c/ops.py
index e2a9ac0b39..d010286ab5 100755
--- a/ops_translator/c/ops.py
+++ b/ops_translator/c/ops.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2.7
+#!/usr/bin/env python3
 # Open source copyright declaration based on BSD open source template:
 # http://www.opensource.org/licenses/bsd-license.php
@@ -643,7 +643,7 @@ def main(source_files):
         repeat = True
         which_file = nk
       else:
-        print('repeated kernel with incompatible arguments: ERROR' + kernels[nk]['name'])
+        print(('repeated kernel with incompatible arguments: ERROR' + kernels[nk]['name']))
       break

diff --git a/ops_translator/c/ops_gen_mpi_cuda.py b/ops_translator/c/ops_gen_mpi_cuda.py
index c3807981cf..9e0d9e3567 100644
--- a/ops_translator/c/ops_gen_mpi_cuda.py
+++ b/ops_translator/c/ops_gen_mpi_cuda.py
@@ -49,6 +49,7 @@
 import re
 import datetime
+import errno
 import os
 import glob
@@ -97,7 +98,7 @@ def ops_gen_mpi_cuda(master, date, consts, kernels, soa_set):
   try:
     os.makedirs('./CUDA')
   except OSError as e:
-    if e.errno != os.errno.EEXIST:
+    if e.errno != errno.EEXIST:
       raise
   for nk in range (0,len(kernels)):
     arg_typ = kernels[nk]['arg_type']
@@ -694,7 +695,7 @@ def ops_gen_mpi_cuda(master, date, consts, kernels, soa_set):
   code('')
   for n in range (0, nargs):
-    if arg_typ[n] == 'ops_arg_gbl' and accs[n] <> OPS_READ:
+    if arg_typ[n] == 'ops_arg_gbl' and accs[n] != OPS_READ:
       code('arg'+str(n)+'.data = block->instance->OPS_reduct_h + reduct_bytes;')
       code('arg'+str(n)+'.data_d = block->instance->OPS_reduct_d + reduct_bytes;')
       code('for (int b=0; b
@@ ... @@
-    if arg_typ[n] == 'ops_arg_gbl' and accs[n] <> OPS_READ:
+    if arg_typ[n] == 'ops_arg_gbl' and accs[n] != OPS_READ:
       code('arg'+str(n)+'.data = block->instance->OPS_reduct_h + reduct_bytes;')
       code('arg'+str(n)+'.data_d = block->instance->OPS_reduct_d + reduct_bytes;')
       code('for (int b=0; b