diff --git a/Common/include/fem_geometry_structure.hpp b/Common/include/fem_geometry_structure.hpp
index 221d04a66b7e..5fbde9b9266b 100644
--- a/Common/include/fem_geometry_structure.hpp
+++ b/Common/include/fem_geometry_structure.hpp
@@ -1038,18 +1038,6 @@ class CMeshFEM_DG: public CMeshFEM {
   */
   void SetSendReceive(CConfig *config);
 
-  /*!
-   * \brief Set the value of the total number of points globally in the simulation.
-   * \param[in] val_global_npoint - Global number of points in the mesh (excluding halos).
-   */
-  void SetGlobal_nPointDomain(unsigned long val_global_npoint);
-
-  /*!
-   * \brief Retrieve total number of nodes in a simulation across all processors (excluding halos).
-   * \return Total number of nodes in a simulation across all processors (excluding halos).
-   */
-  unsigned long GetGlobal_nPointDomain() const override;
-
   /*!
    * \brief Set the local index that correspond with the global numbering index.
    */
@@ -1063,8 +1051,7 @@ class CMeshFEM_DG: public CMeshFEM {
   long GetGlobal_to_Local_Point(unsigned long val_ipoint) const override;
 
   /*!
-   * \brief Function, which carries out the preprocessing tasks
-            when wall functions are used.
+   * \brief Function, which carries out the preprocessing tasks when wall functions are used.
    * \param[in] config - Definition of the particular problem.
    */
   void WallFunctionPreprocessing(CConfig *config);
diff --git a/Common/include/fem_geometry_structure.inl b/Common/include/fem_geometry_structure.inl
index b00d06e66bd9..f3daa02859f7 100644
--- a/Common/include/fem_geometry_structure.inl
+++ b/Common/include/fem_geometry_structure.inl
@@ -144,10 +144,6 @@ inline CMeshFEM_DG::CMeshFEM_DG(void) : CMeshFEM() { }
 
 inline CMeshFEM_DG::~CMeshFEM_DG(void) { }
 
-inline void CMeshFEM_DG::SetGlobal_nPointDomain(unsigned long val_global_npoint) { Global_nPointDomain =  val_global_npoint; }
-
-inline unsigned long CMeshFEM_DG::GetGlobal_nPointDomain(void) const { return Global_nPointDomain; }
-
 inline void CMeshFEM_DG::SetGlobal_to_Local_Point(void) {
   Global_to_Local_Point.clear();
   unsigned long ii = 0;
diff --git a/Common/include/geometry/CGeometry.hpp b/Common/include/geometry/CGeometry.hpp
index 0131d2e6fbac..147ecd6fb490 100644
--- a/Common/include/geometry/CGeometry.hpp
+++ b/Common/include/geometry/CGeometry.hpp
@@ -53,6 +53,7 @@ extern "C" {
 #include "../dual_grid_structure.hpp"
 #include "../config_structure.hpp"
 #include "../geometry_structure_fem_part.hpp"
+#include "../toolboxes/graph_toolbox.hpp"
 
 using namespace std;
 
@@ -99,30 +100,42 @@ class CGeometry {
   nelem_quad_bound,             /*!< \brief Number of quads on the mesh boundaries. */
   Global_nelem_quad_bound;      /*!< \brief Total number of quads on the mesh boundaries across all processors. */
 
-  unsigned short nDim,	        /*!< \brief Number of dimension of the problem. */
-  nZone,                        /*!< \brief Number of zones in the problem. */
-  nMarker;                      /*!< \brief Number of different markers of the mesh. */
+  unsigned short nDim;	        /*!< \brief Number of dimension of the problem. */
+  unsigned short nZone;         /*!< \brief Number of zones in the problem. */
+  unsigned short nMarker;       /*!< \brief Number of different markers of the mesh. */
+  unsigned short nCommLevel;    /*!< \brief Number of non-blocking communication levels. */
 
-  unsigned short MGLevel;         /*!< \brief The mesh level index for the current geometry container. */
-  unsigned long Max_GlobalPoint;  /*!< \brief Greater global point in the domain local structure. */
+  unsigned short MGLevel;        /*!< \brief The mesh level index for the current geometry container. */
+  unsigned long Max_GlobalPoint; /*!< \brief Greater global point in the domain local structure. */
 
-  /* --- Custom boundary variables --- */
+  /*--- Boundary information. ---*/
+
+  short *Marker_All_SendRecv;   /*!< \brief MPI Marker. */
   su2double **CustomBoundaryTemperature;
   su2double **CustomBoundaryHeatFlux;
 
-public:
-  unsigned long *nElem_Bound;            /*!< \brief Number of elements of the boundary. */
-  string *Tag_to_Marker;                 /*!< \brief If you know the index of the boundary (depend of the grid definition),
-                                                     it gives you the maker (where the boundary is stored from 0 to boundaries). */
-  CPrimalGrid** elem;                    /*!< \brief Element vector (primal grid information). */
-  CPrimalGrid** face;                    /*!< \brief Face vector (primal grid information). */
-  CPrimalGrid*** bound;	                 /*!< \brief Boundary vector (primal grid information). */
-  CPoint** node;                         /*!< \brief Node vector (dual grid information). */
-  CEdge** edge;                          /*!< \brief Edge vector (dual grid information). */
-  CVertex*** vertex;                     /*!< \brief Boundary Vertex vector (dual grid information). */
-  CTurboVertex**** turbovertex;          /*!< \brief Boundary Vertex vector ordered for turbomachinery calculation(dual grid information). */
-  unsigned long *nVertex;                /*!< \brief Number of vertex for each marker. */
-  vector<bool> bound_is_straight;        /*!< \brief Bool if boundary-marker is straight(2D)/plane(3D) for each local marker. */
+  /*--- Create vectors and distribute the values among the different planes queues ---*/
+
+  vector<vector<su2double> > Xcoord_plane;     /*!< \brief Vector containing x coordinates of new points appearing on a single plane */
+  vector<vector<su2double> > Ycoord_plane;     /*!< \brief Vector containing y coordinates of new points appearing on a single plane */
+  vector<vector<su2double> > Zcoord_plane;     /*!< \brief Vector containing z coordinates of new points appearing on a single plane */
+  vector<vector<su2double> > FaceArea_plane;   /*!< \brief Vector containing area/volume associated with new points appearing on a single plane */
+  vector<vector<unsigned long> > Plane_points; /*!< \brief Vector containing points appearing on a single plane */
+
+  vector<su2double> XCoordList;	  /*!< \brief Vector containing points appearing on a single plane */
+  CPrimalGrid*** newBound;        /*!< \brief Boundary vector for new periodic elements (primal grid information). */
+  unsigned long *nNewElem_Bound;  /*!< \brief Number of new periodic elements of the boundary. */
+
+#ifdef HAVE_MPI
+#ifdef HAVE_PARMETIS
+  vector<vector<unsigned long> > adj_nodes; /*!< \brief Vector of vectors holding each node's adjacency during preparation for ParMETIS. */
+  idx_t *adjacency; /*!< \brief Local adjacency array to be input into ParMETIS for partitioning (idx_t is a ParMETIS type defined in their headers). */
+  idx_t *xadj;      /*!< \brief Index array that points to the start of each node's adjacency in CSR format (needed to interpret the adjacency array).  */
+#endif
+#endif
+
+  /*--- Turbomachinery variables ---*/
+
   unsigned short *nSpanWiseSections;     /*!< \brief Number of Span wise section for each turbo marker, indexed by inflow/outflow */
   unsigned short *nSpanSectionsByMarker; /*!< \brief Number of Span wise section for each turbo marker, indexed by marker.  Needed for deallocation.*/
   unsigned short nTurboPerf;             /*!< \brief Number of Span wise section for each turbo marker. */
@@ -139,89 +152,102 @@ class CGeometry {
   su2double **MinAngularCoord;           /*!< \brief Max angular pitch at each span wise section for each marker.*/
   su2double **MinRelAngularCoord;        /*!< \brief Min relative angular coord at each span wise section for each marker.*/
   su2double **TurboRadius;               /*!< \brief Radius at each span wise section for each marker.*/
-  su2double **TangGridVelIn,
-  **TangGridVelOut;                      /*!< \brief Average tangential rotational speed at each span wise section for each turbomachinery marker.*/
-  su2double **SpanAreaIn,
-  **SpanAreaOut;                         /*!< \brief Area at each span wise section for each turbomachinery marker.*/
-  su2double **TurboRadiusIn,
-  **TurboRadiusOut;                      /*!< \brief Radius at each span wise section for each turbomachinery marker*/
+  su2double **TangGridVelIn;
+  su2double **TangGridVelOut;            /*!< \brief Average tangential rotational speed at each span wise section for each turbomachinery marker.*/
+  su2double **SpanAreaIn;
+  su2double **SpanAreaOut;               /*!< \brief Area at each span wise section for each turbomachinery marker.*/
+  su2double **TurboRadiusIn;
+  su2double **TurboRadiusOut;            /*!< \brief Radius at each span wise section for each turbomachinery marker*/
 
-  unsigned short nCommLevel;             /*!< \brief Number of non-blocking communication levels. */
+  /*--- Sparsity patterns associated with the geometry. ---*/
 
-  short *Marker_All_SendRecv;            /*!< \brief MPI Marker. */
+  CCompressedSparsePatternUL
+  finiteVolumeCSRFill0,                  /*!< \brief 0-fill FVM sparsity. */
+  finiteVolumeCSRFillN,                  /*!< \brief N-fill FVM sparsity (e.g. for ILUn preconditioner). */
+  finiteElementCSRFill0,                 /*!< \brief 0-fill FEM sparsity. */
+  finiteElementCSRFillN;                 /*!< \brief N-fill FEM sparsity (e.g. for ILUn preconditioner). */
 
-  /*--- Create vectors and distribute the values among the different planes queues ---*/
-  vector<vector<su2double> > Xcoord_plane;     /*!< \brief Vector containing x coordinates of new points appearing on a single plane */
-  vector<vector<su2double> > Ycoord_plane;     /*!< \brief Vector containing y coordinates of new points appearing on a single plane */
-  vector<vector<su2double> > Zcoord_plane;     /*!< \brief Vector containing z coordinates of new points appearing on a single plane */
-  vector<vector<su2double> > FaceArea_plane;   /*!< \brief Vector containing area/volume associated with  new points appearing on a single plane */
-  vector<vector<unsigned long> > Plane_points; /*!< \brief Vector containing points appearing on a single plane */
+  CEdgeToNonZeroMapUL edgeToCSRMap;      /*!< \brief Map edges to CSR entries referenced by them (i,j) and (j,i). */
 
-  vector<su2double> XCoordList;	  /*!< \brief Vector containing points appearing on a single plane */
-  CPrimalGrid*** newBound;        /*!< \brief Boundary vector for new periodic elements (primal grid information). */
-  unsigned long *nNewElem_Bound;  /*!< \brief Number of new periodic elements of the boundary. */
+  /*--- Edge and element colorings. ---*/
 
-  /*--- Partitioning-specific variables ---*/
+  CCompressedSparsePatternUL
+  edgeColoring,                          /*!< \brief Edge coloring structure for thread-based parallelization. */
+  elemColoring;                          /*!< \brief Element coloring structure for thread-based parallelization. */
+  unsigned long edgeColorGroupSize = 1;  /*!< \brief Size of the edge groups within each color. */
+  unsigned long elemColorGroupSize = 1;  /*!< \brief Size of the element groups within each color. */
 
-  map<unsigned long,unsigned long> Global_to_Local_Elem;  /*!< \brief Mapping of global to local index for elements. */
-  unsigned long *beg_node;                                /*!< \brief Array containing the first node on each rank due to a linear partitioning by global index. */
-  unsigned long *end_node;                                /*!< \brief Array containing the last node on each rank due to a linear partitioning by global index. */
-  unsigned long *nPointLinear;                            /*!< \brief Array containing the total number of nodes on each rank due to a linear partioning by global index. */
-  unsigned long *nPointCumulative;                        /*!< \brief Cumulative storage array containing the total number of points on all prior ranks in the linear partitioning. */
+public:
+  /*--- Main geometric elements of the grid. ---*/
 
-#ifdef HAVE_MPI
-#ifdef HAVE_PARMETIS
-  vector< vector<unsigned long> > adj_nodes; /*!< \brief Vector of vectors holding each node's adjacency during preparation for ParMETIS. */
-  idx_t *adjacency; /*!< \brief Local adjacency array to be input into ParMETIS for partitioning (idx_t is a ParMETIS type defined in their headers). */
-  idx_t *xadj;      /*!< \brief Index array that points to the start of each node's adjacency in CSR format (needed to interpret the adjacency array).  */
-#endif
-#endif
+  CPrimalGrid** elem;                    /*!< \brief Element vector (primal grid information). */
+  CPrimalGrid** face;                    /*!< \brief Face vector (primal grid information). */
+  CPrimalGrid*** bound;	                 /*!< \brief Boundary vector (primal grid information). */
+  CPoint** node;                         /*!< \brief Node vector (dual grid information). */
+  CEdge** edge;                          /*!< \brief Edge vector (dual grid information). */
+  CVertex*** vertex;                     /*!< \brief Boundary Vertex vector (dual grid information). */
+  CTurboVertex**** turbovertex;          /*!< \brief Boundary Vertex vector ordered for turbomachinery calculation (dual grid information). */
+  unsigned long *nVertex;                /*!< \brief Number of vertex for each marker. */
+  unsigned long *nElem_Bound;            /*!< \brief Number of elements of the boundary. */
+  string *Tag_to_Marker;                 /*!< \brief Names of boundary markers. */
+  vector<bool> bound_is_straight;        /*!< \brief Bool if boundary-marker is straight(2D)/plane(3D) for each local marker. */
+
+  /*--- Partitioning-specific variables ---*/
+
+  map<unsigned long,unsigned long> Global_to_Local_Elem; /*!< \brief Mapping of global to local index for elements. */
+  unsigned long *beg_node;           /*!< \brief Array containing the first node on each rank due to a linear partitioning by global index. */
+  unsigned long *end_node;           /*!< \brief Array containing the last node on each rank due to a linear partitioning by global index. */
+  unsigned long *nPointLinear;       /*!< \brief Array containing the total number of nodes on each rank due to a linear partitioning by global index. */
+  unsigned long *nPointCumulative;   /*!< \brief Cumulative storage array containing the total number of points on all prior ranks in the linear partitioning. */
 
   /*--- Data structures for point-to-point MPI communications. ---*/
 
-  int countPerPoint;                  /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */
-  int nP2PSend;                       /*!< \brief Number of sends during point-to-point comms. */
-  int nP2PRecv;                       /*!< \brief Number of receives during point-to-point comms. */
-  int *nPoint_P2PSend;                /*!< \brief Data structure holding number of vertices for each send in point-to-point comms. */
-  int *nPoint_P2PRecv;                /*!< \brief Data structure holding number of vertices for each recv in point-to-point comms. */
-  int *Neighbors_P2PSend;             /*!< \brief Data structure holding the ranks of the neighbors for point-to-point send comms. */
-  int *Neighbors_P2PRecv;             /*!< \brief Data structure holding the ranks of the neighbors for point-to-point recv comms. */
-  map<int, int> P2PSend2Neighbor;     /*!< \brief Data structure holding the reverse mapping of the ranks of the neighbors for point-to-point send comms. */
-  map<int, int> P2PRecv2Neighbor;     /*!< \brief Data structure holding the reverse mapping of the ranks of the neighbors for point-to-point recv comms. */
-  unsigned long *Local_Point_P2PSend; /*!< \brief Data structure holding the local index of all vertices to be sent in point-to-point comms. */
-  unsigned long *Local_Point_P2PRecv; /*!< \brief Data structure holding the local index of all vertices to be received in point-to-point comms. */
-  su2double *bufD_P2PRecv;            /*!< \brief Data structure for su2double point-to-point receive. */
-  su2double *bufD_P2PSend;            /*!< \brief Data structure for su2double point-to-point send. */
-  unsigned short *bufS_P2PRecv;       /*!< \brief Data structure for unsigned long point-to-point receive. */
-  unsigned short *bufS_P2PSend;       /*!< \brief Data structure for unsigned long point-to-point send. */
-  SU2_MPI::Request *req_P2PSend;      /*!< \brief Data structure for point-to-point send requests. */
-  SU2_MPI::Request *req_P2PRecv;      /*!< \brief Data structure for point-to-point recv requests. */
+  int countPerPoint;                     /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */
+  int nP2PSend;                          /*!< \brief Number of sends during point-to-point comms. */
+  int nP2PRecv;                          /*!< \brief Number of receives during point-to-point comms. */
+  int *nPoint_P2PSend;                   /*!< \brief Data structure holding number of vertices for each send in point-to-point comms. */
+  int *nPoint_P2PRecv;                   /*!< \brief Data structure holding number of vertices for each recv in point-to-point comms. */
+  int *Neighbors_P2PSend;                /*!< \brief Data structure holding the ranks of the neighbors for point-to-point send comms. */
+  int *Neighbors_P2PRecv;                /*!< \brief Data structure holding the ranks of the neighbors for point-to-point recv comms. */
+  map<int, int> P2PSend2Neighbor;        /*!< \brief Data structure holding the reverse mapping of the ranks of the neighbors for point-to-point send comms. */
+  map<int, int> P2PRecv2Neighbor;        /*!< \brief Data structure holding the reverse mapping of the ranks of the neighbors for point-to-point recv comms. */
+  unsigned long *Local_Point_P2PSend;    /*!< \brief Data structure holding the local index of all vertices to be sent in point-to-point comms. */
+  unsigned long *Local_Point_P2PRecv;    /*!< \brief Data structure holding the local index of all vertices to be received in point-to-point comms. */
+  su2double *bufD_P2PRecv;               /*!< \brief Data structure for su2double point-to-point receive. */
+  su2double *bufD_P2PSend;               /*!< \brief Data structure for su2double point-to-point send. */
+  unsigned short *bufS_P2PRecv;          /*!< \brief Data structure for unsigned short point-to-point receive. */
+  unsigned short *bufS_P2PSend;          /*!< \brief Data structure for unsigned short point-to-point send. */
+  SU2_MPI::Request *req_P2PSend;         /*!< \brief Data structure for point-to-point send requests. */
+  SU2_MPI::Request *req_P2PRecv;         /*!< \brief Data structure for point-to-point recv requests. */
 
   /*--- Data structures for periodic communications. ---*/
 
-  int countPerPeriodicPoint;                /*!< \brief Maximum number of pieces of data sent per vertex in periodic comms. */
-  int nPeriodicSend;                        /*!< \brief Number of sends during periodic comms. */
-  int nPeriodicRecv;                        /*!< \brief Number of receives during periodic comms. */
-  int *nPoint_PeriodicSend;                 /*!< \brief Data structure holding number of vertices for each send in periodic comms. */
-  int *nPoint_PeriodicRecv;                 /*!< \brief Data structure holding number of vertices for each recv in periodic comms. */
-  int *Neighbors_PeriodicSend;              /*!< \brief Data structure holding the ranks of the neighbors for periodic send comms. */
-  int *Neighbors_PeriodicRecv;              /*!< \brief Data structure holding the ranks of the neighbors for periodic recv comms. */
-  map<int, int> PeriodicSend2Neighbor;      /*!< \brief Data structure holding the reverse mapping of the ranks of the neighbors for periodic send comms. */
-  map<int, int> PeriodicRecv2Neighbor;      /*!< \brief Data structure holding the reverse mapping of the ranks of the neighbors for periodic recv comms. */
-  unsigned long *Local_Point_PeriodicSend;  /*!< \brief Data structure holding the local index of all vertices to be sent in periodic comms. */
-  unsigned long *Local_Point_PeriodicRecv;  /*!< \brief Data structure holding the local index of all vertices to be received in periodic comms. */
-  unsigned long *Local_Marker_PeriodicSend; /*!< \brief Data structure holding the local index of the periodic marker for a particular vertex to be sent in periodic comms. */
-  unsigned long *Local_Marker_PeriodicRecv; /*!< \brief Data structure holding the local index of the periodic marker for a particular vertex to be received in periodic comms. */
-  su2double *bufD_PeriodicRecv;             /*!< \brief Data structure for su2double periodic receive. */
-  su2double *bufD_PeriodicSend;             /*!< \brief Data structure for su2double periodic send. */
-  unsigned short *bufS_PeriodicRecv;        /*!< \brief Data structure for unsigned long periodic receive. */
-  unsigned short *bufS_PeriodicSend;        /*!< \brief Data structure for unsigned long periodic send. */
-  SU2_MPI::Request *req_PeriodicSend;       /*!< \brief Data structure for periodic send requests. */
-  SU2_MPI::Request *req_PeriodicRecv;       /*!< \brief Data structure for periodic recv requests. */
-
-  vector<su2double> Orthogonality;          /*!< \brief Measure of dual CV orthogonality angle (0 to 90 deg., 90 being best). */
-  vector<su2double> Aspect_Ratio;           /*!< \brief Measure of dual CV aspect ratio (max face area / min face area).  */
-  vector<su2double> Volume_Ratio;           /*!< \brief Measure of dual CV volume ratio (max sub-element volume / min sub-element volume). */
+  int countPerPeriodicPoint;             /*!< \brief Maximum number of pieces of data sent per vertex in periodic comms. */
+  int nPeriodicSend;                     /*!< \brief Number of sends during periodic comms. */
+  int nPeriodicRecv;                     /*!< \brief Number of receives during periodic comms. */
+  int *nPoint_PeriodicSend;              /*!< \brief Data structure holding number of vertices for each send in periodic comms. */
+  int *nPoint_PeriodicRecv;              /*!< \brief Data structure holding number of vertices for each recv in periodic comms. */
+  int *Neighbors_PeriodicSend;           /*!< \brief Data structure holding the ranks of the neighbors for periodic send comms. */
+  int *Neighbors_PeriodicRecv;           /*!< \brief Data structure holding the ranks of the neighbors for periodic recv comms. */
+  map<int, int> PeriodicSend2Neighbor;   /*!< \brief Data structure holding the reverse mapping of the ranks of the neighbors for periodic send comms. */
+  map<int, int> PeriodicRecv2Neighbor;   /*!< \brief Data structure holding the reverse mapping of the ranks of the neighbors for periodic recv comms. */
+  unsigned long
+  *Local_Point_PeriodicSend,             /*!< \brief Data structure holding the local index of all vertices to be sent in periodic comms. */
+  *Local_Point_PeriodicRecv,             /*!< \brief Data structure holding the local index of all vertices to be received in periodic comms. */
+  *Local_Marker_PeriodicSend,            /*!< \brief Data structure holding the local index of the periodic marker for a particular vertex to be sent in periodic comms. */
+  *Local_Marker_PeriodicRecv;            /*!< \brief Data structure holding the local index of the periodic marker for a particular vertex to be received in periodic comms. */
+  su2double *bufD_PeriodicRecv;          /*!< \brief Data structure for su2double periodic receive. */
+  su2double *bufD_PeriodicSend;          /*!< \brief Data structure for su2double periodic send. */
+  unsigned short *bufS_PeriodicRecv;     /*!< \brief Data structure for unsigned short periodic receive. */
+  unsigned short *bufS_PeriodicSend;     /*!< \brief Data structure for unsigned short periodic send. */
+  SU2_MPI::Request *req_PeriodicSend;    /*!< \brief Data structure for periodic send requests. */
+  SU2_MPI::Request *req_PeriodicRecv;    /*!< \brief Data structure for periodic recv requests. */
+
+  /*--- Mesh quality metrics. ---*/
+
+  vector<su2double> Orthogonality;       /*!< \brief Measure of dual CV orthogonality angle (0 to 90 deg., 90 being best). */
+  vector<su2double> Aspect_Ratio;        /*!< \brief Measure of dual CV aspect ratio (max face area / min face area).  */
+  vector<su2double> Volume_Ratio;        /*!< \brief Measure of dual CV volume ratio (max sub-element volume / min sub-element volume). */
 
   /*!
    * \brief Constructor of the class.
@@ -348,10 +374,16 @@ class CGeometry {
   inline unsigned long GetnPointDomain(void) const {return nPointDomain;}
 
   /*!
-   * \brief Get number of elements.
-   * \return Number of elements.
+   * \brief Retrieve total number of nodes in a simulation across all processors (including halos).
+   * \return Total number of nodes in a simulation across all processors (including halos).
+   */
+  inline unsigned long GetGlobal_nPoint(void) const { return Global_nPoint; }
+
+  /*!
+   * \brief Retrieve total number of nodes in a simulation across all processors (excluding halos).
+   * \return Total number of nodes in a simulation across all processors (excluding halos).
    */
-  unsigned long GetnLine(void);
+  inline unsigned long GetGlobal_nPointDomain(void) const { return Global_nPointDomain; }
 
   /*!
    * \brief Get number of elements.
@@ -505,6 +537,12 @@ class CGeometry {
    */
   inline void SetnPointDomain(unsigned long val_npoint) { nPointDomain = val_npoint; }
 
+  /*!
+   * \brief Set the value of the total number of points globally in the simulation.
+   * \param[in] val_global_npoint - Global number of points in the mesh (excluding halos).
+   */
+  void SetGlobal_nPointDomain(unsigned long val_global_npoint) { Global_nPointDomain = val_global_npoint; }
+
   /*!
    * \brief Set the number of grid elements.
    * \param[in] val_nelem - Number of grid elements.
@@ -1025,118 +1063,100 @@ class CGeometry {
   inline virtual unsigned short GetGlobal_to_Local_Marker(unsigned short val_imarker) const { return 0; }
 
   /*!
-   * \brief A virtual member.
-   * \return Total number of nodes in a simulation across all processors (including halos).
-   */
-  inline virtual unsigned long GetGlobal_nPoint() const { return 0; }
-
-  /*!
-   * \brief A virtual member.
-   * \return Total number of nodes in a simulation across all processors (excluding halos).
-   */
-  inline virtual unsigned long GetGlobal_nPointDomain() const { return 0; }
-
-  /*!
-   * \brief A virtual member.
-   * \param[in] val_global_npoint - Global number of points in the mesh (excluding halos).
-   */
-  inline virtual void SetGlobal_nPointDomain(unsigned long val_global_npoint) {}
-
-  /*!
-   * \brief A virtual member.
+   * \brief Retrieve total number of elements in a simulation across all processors.
    * \return Total number of elements in a simulation across all processors.
    */
-  inline virtual unsigned long GetGlobal_nElem() const { return 0; }
+  inline unsigned long GetGlobal_nElem(void) const { return Global_nElem; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Retrieve total number of elements in a simulation across all processors (excluding halos).
    * \return Total number of elements in a simulation across all processors (excluding halos).
    */
-  inline virtual unsigned long GetGlobal_nElemDomain() const { return 0; }
+  inline unsigned long GetGlobal_nElemDomain(void) const { return Global_nElemDomain; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Retrieve total number of line elements in a simulation across all processors.
    * \return Total number of line elements in a simulation across all processors.
    */
-  inline virtual unsigned long GetGlobal_nElemLine() const { return 0; }
+  inline unsigned long GetGlobal_nElemLine(void) const { return Global_nelem_edge; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Retrieve total number of triangular elements in a simulation across all processors.
    * \return Total number of triangular elements in a simulation across all processors.
    */
-  inline virtual unsigned long GetGlobal_nElemTria() const { return 0; }
+  inline unsigned long GetGlobal_nElemTria(void) const { return Global_nelem_triangle; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Retrieve total number of quadrilateral elements in a simulation across all processors.
    * \return Total number of quadrilateral elements in a simulation across all processors.
    */
-  inline virtual unsigned long GetGlobal_nElemQuad() const { return 0; }
+  inline unsigned long GetGlobal_nElemQuad(void) const { return Global_nelem_quad; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Retrieve total number of tetrahedral elements in a simulation across all processors.
    * \return Total number of tetrahedral elements in a simulation across all processors.
    */
-  inline virtual unsigned long GetGlobal_nElemTetr() const { return 0; }
+  inline unsigned long GetGlobal_nElemTetr(void) const { return Global_nelem_tetra; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Retrieve total number of hexahedral elements in a simulation across all processors.
    * \return Total number of hexahedral elements in a simulation across all processors.
    */
-  inline virtual unsigned long GetGlobal_nElemHexa() const { return 0; }
+  inline unsigned long GetGlobal_nElemHexa(void) const { return Global_nelem_hexa; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Retrieve total number of prism elements in a simulation across all processors.
    * \return Total number of prism elements in a simulation across all processors.
    */
-  inline virtual unsigned long GetGlobal_nElemPris() const { return 0; }
+  inline unsigned long GetGlobal_nElemPris(void) const { return Global_nelem_prism; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Retrieve total number of pyramid elements in a simulation across all processors.
    * \return Total number of pyramid elements in a simulation across all processors.
    */
-  inline virtual unsigned long GetGlobal_nElemPyra() const { return 0; }
+  inline unsigned long GetGlobal_nElemPyra(void) const { return Global_nelem_pyramid; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get number of line elements.
    * \return Number of line elements.
    */
-  inline virtual unsigned long GetnElemLine() const { return 0; }
+  inline unsigned long GetnElemLine(void) const { return nelem_edge; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get number of triangular elements.
    * \return Number of triangular elements.
    */
-  inline virtual unsigned long GetnElemTria() const { return 0; }
+  inline unsigned long GetnElemTria(void) const { return nelem_triangle; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get number of quadrilateral elements.
    * \return Number of quadrilateral elements.
    */
-  inline virtual unsigned long GetnElemQuad() const { return 0; }
+  inline unsigned long GetnElemQuad(void) const { return nelem_quad; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get number of tetrahedral elements.
    * \return Number of tetrahedral elements.
    */
-  inline virtual unsigned long GetnElemTetr() const { return 0; }
+  inline unsigned long GetnElemTetr(void) const { return nelem_tetra; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get number of hexahedral elements.
    * \return Number of hexahedral elements.
    */
-  inline virtual unsigned long GetnElemHexa() const { return 0; }
+  inline unsigned long GetnElemHexa(void) const { return nelem_hexa; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get number of prism elements.
    * \return Number of prism elements.
    */
-  inline virtual unsigned long GetnElemPris() const { return 0; }
+  inline unsigned long GetnElemPris(void) const { return nelem_prism; }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get number of pyramid elements.
    * \return Number of pyramid elements.
    */
-  inline virtual unsigned long GetnElemPyra() const { return 0; }
+  inline unsigned long GetnElemPyra(void) const { return nelem_pyramid; }
 
   /*!
    * \brief Indentify geometrical planes in the mesh
@@ -1272,164 +1292,213 @@ class CGeometry {
   inline virtual void SetSensitivity(unsigned long iPoint, unsigned short iDim, su2double val) {}
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the average normal at a specific span for a given marker in the turbomachinery frame of reference.
    * \param[in] val_marker - marker value.
    * \param[in] val_span - span value.
+   * \return The span-wise averaged turbo normal.
    */
-  inline virtual const su2double* GetAverageTurboNormal(unsigned short val_marker, unsigned short val_span) const { return nullptr; }
+  inline const su2double* GetAverageTurboNormal(unsigned short val_marker, unsigned short val_span) const {
+    return AverageTurboNormal[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the average normal at a specific span for a given marker.
    * \param[in] val_marker - marker value.
    * \param[in] val_span - span value.
+   * \return The span-wise averaged normal.
    */
-  inline virtual const su2double* GetAverageNormal(unsigned short val_marker, unsigned short val_span) const { return nullptr; }
+  inline const su2double* GetAverageNormal(unsigned short val_marker, unsigned short val_span) const {
+    return AverageNormal[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the value of the total area for each span.
    * \param[in] val_marker - marker value.
    * \param[in] val_span - span value.
+   * \return The span-wise area.
    */
-  inline virtual su2double GetSpanArea(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+  inline su2double GetSpanArea(unsigned short val_marker, unsigned short val_span) const {
+    return SpanArea[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the value of the radius for each span.
    * \param[in] val_marker - marker value.
    * \param[in] val_span - span value.
+   * \return The span-wise radius.
    */
-  inline virtual su2double GetTurboRadius(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+  inline su2double GetTurboRadius(unsigned short val_marker, unsigned short val_span) const {
+    return TurboRadius[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the value of the average tangential rotational velocity for each span.
    * \param[in] val_marker - marker value.
    * \param[in] val_span - span value.
+   * \return The span-wise averaged tangential velocity.
    */
-  inline virtual su2double GetAverageTangGridVel(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+  inline su2double GetAverageTangGridVel(unsigned short val_marker, unsigned short val_span) const {
+    return AverageTangGridVel[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the value of the inflow tangential velocity at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    * \return The span-wise inflow tangential velocity.
    */
-  inline virtual su2double GetTangGridVelIn(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+  inline su2double GetTangGridVelIn(unsigned short val_marker, unsigned short val_span) const {
+    return TangGridVelIn[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the value of the outflow tangential velocity at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    * \return The span-wise outflow tangential velocity.
    */
-  inline virtual su2double GetTangGridVelOut(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+  inline su2double GetTangGridVelOut(unsigned short val_marker, unsigned short val_span) const {
+    return TangGridVelOut[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the value of the inflow area at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    * \return The span-wise inflow area.
    */
-  inline virtual su2double GetSpanAreaIn(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+  inline su2double GetSpanAreaIn(unsigned short val_marker, unsigned short val_span) const {
+    return SpanAreaIn[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the value of the outflow area at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    * \return The span-wise outflow area.
    */
-  inline virtual su2double GetSpanAreaOut(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+  inline su2double GetSpanAreaOut(unsigned short val_marker, unsigned short val_span) const {
+    return SpanAreaOut[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the value of the inflow radius at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    * \return The span-wise inflow radius.
    */
-  inline virtual su2double GetTurboRadiusIn(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+  inline su2double GetTurboRadiusIn(unsigned short val_marker, unsigned short val_span) const {
+    return TurboRadiusIn[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the value of the outflow radius at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    * \return The span-wise outflow radius.
    */
-  inline virtual su2double GetTurboRadiusOut(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+  inline su2double GetTurboRadiusOut(unsigned short val_marker, unsigned short val_span) const {
+    return TurboRadiusOut[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Set the value of the inflow tangential velocity at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    */
-  inline virtual void SetTangGridVelIn(su2double value, unsigned short val_marker, unsigned short val_span) {}
+  inline void SetTangGridVelIn(su2double value, unsigned short val_marker, unsigned short val_span) {
+    TangGridVelIn[val_marker][val_span] = value;
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Set the value of the outflow tangential velocity at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    */
-  inline virtual void SetTangGridVelOut(su2double value, unsigned short val_marker, unsigned short val_span) {}
+  inline void SetTangGridVelOut(su2double value, unsigned short val_marker, unsigned short val_span) {
+    TangGridVelOut[val_marker][val_span] = value;
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Set the value of the inflow area at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    */
-  inline virtual void SetSpanAreaIn(su2double value, unsigned short val_marker, unsigned short val_span) {}
+  inline void SetSpanAreaIn(su2double value, unsigned short val_marker, unsigned short val_span) {
+    SpanAreaIn[val_marker][val_span] = value;
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Set the value of the outflow area at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    */
-  inline virtual void SetSpanAreaOut(su2double value, unsigned short val_marker, unsigned short val_span) {}
+  inline void SetSpanAreaOut(su2double value, unsigned short val_marker, unsigned short val_span) {
+    SpanAreaOut[val_marker][val_span] = value;
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Set the value of the inflow radius at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    */
-  inline virtual void SetTurboRadiusIn(su2double value, unsigned short val_marker, unsigned short val_span) {}
+  inline void SetTurboRadiusIn(su2double value, unsigned short val_marker, unsigned short val_span) {
+    TurboRadiusIn[val_marker][val_span] = value;
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Set the value of the outflow radius at each span.
    * \param[in] val_marker - marker turbo-performance value.
    * \param[in] val_span - span value.
    */
-  inline virtual void SetTurboRadiusOut(su2double value, unsigned short val_marker, unsigned short val_span) {}
+  inline void SetTurboRadiusOut(su2double value, unsigned short val_marker, unsigned short val_span) {
+    TurboRadiusOut[val_marker][val_span] = value;
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the total number of vertices, independently of the MPI partitions.
    * \param[in] val_marker - marker value.
    * \param[in] val_span - span value.
    */
-  inline virtual unsigned long GetnTotVertexSpan(unsigned short val_marker, unsigned short val_span) const {return 0;}
+  inline unsigned long GetnTotVertexSpan(unsigned short val_marker, unsigned short val_span) const {
+    return nTotVertexSpan[val_marker][val_span];
+  }
 
   /*!
- * \brief A virtual member.
- * \param[in] val_marker - marker value.
- * \param[in] val_span - span value.
- */
-  inline virtual su2double GetMinAngularCoord(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+   * \brief Get the min angular pitch, independently of the MPI partitions.
+   * \param[in] val_marker - marker value.
+   * \param[in] val_span - span value.
+   */
+  inline su2double GetMinAngularCoord(unsigned short val_marker, unsigned short val_span) const {
+    return MinAngularCoord[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the max angular pitch, independently of the MPI partitions.
    * \param[in] val_marker - marker value.
    * \param[in] val_span - span value.
    */
-  inline virtual su2double GetMaxAngularCoord(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+  inline su2double GetMaxAngularCoord(unsigned short val_marker, unsigned short val_span) const {
+    return MaxAngularCoord[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the min relative angular coordinate, independently of the MPI partitions.
    * \param[in] val_marker - marker value.
    * \param[in] val_span - span value.
    */
-  inline virtual su2double GetMinRelAngularCoord(unsigned short val_marker, unsigned short val_span) const { return 0.0; }
+  inline su2double GetMinRelAngularCoord(unsigned short val_marker, unsigned short val_span) const {
+    return MinRelAngularCoord[val_marker][val_span];
+  }
 
   /*!
-   * \brief A virtual member.
+   * \brief Get the average grid velocity at a specific span for a given marker.
    * \param[in] val_marker - marker value.
    * \param[in] val_span - span value.
    */
-  inline virtual const su2double* GetAverageGridVel(unsigned short val_marker, unsigned short val_span) const {return nullptr;}
+  inline const su2double* GetAverageGridVel(unsigned short val_marker, unsigned short val_span) const {
+    return AverageGridVel[val_marker][val_span];
+  }
 
   /*!
    * \brief A virtual member.
@@ -1541,5 +1610,35 @@ class CGeometry {
    */
   inline virtual void ComputeMeshQualityStatistics(CConfig *config) {}
 
+  /*!
+   * \brief Get the sparse pattern of "type" with given level of fill.
+   * \note This method builds the pattern if that has not been done yet.
+   * \param[in] type - Finite volume or finite element.
+   * \param[in] fillLvl - Level of fill of the pattern.
+   * \return Reference to the sparse pattern.
+   */
+  const CCompressedSparsePatternUL& GetSparsePattern(ConnectivityType type, unsigned long fillLvl);
+
+  /*!
+   * \brief Get the edge to sparse pattern map.
+   * \note This method builds the map and required pattern (0-fill FVM) if that has not been done yet.
+   * \return Reference to the map.
+   */
+  const CEdgeToNonZeroMapUL& GetEdgeToSparsePatternMap(void);
+
+  /*!
+   * \brief Get the edge coloring.
+   * \note This method computes the coloring if that has not been done yet.
+   * \return Reference to the coloring.
+   */
+  const CCompressedSparsePatternUL& GetEdgeColoring(void);
+
+  /*!
+   * \brief Get the element coloring.
+   * \note This method computes the coloring if that has not been done yet.
+   * \return Reference to the coloring.
+   */
+  const CCompressedSparsePatternUL& GetElementColoring(void);
+
 };
 
diff --git a/Common/include/geometry/CPhysicalGeometry.hpp b/Common/include/geometry/CPhysicalGeometry.hpp
index b2428e1b97d3..8db4b2d17025 100644
--- a/Common/include/geometry/CPhysicalGeometry.hpp
+++ b/Common/include/geometry/CPhysicalGeometry.hpp
@@ -29,6 +29,7 @@
 
 #include "CGeometry.hpp"
 #include "../CMeshReaderFVM.hpp"
+#include "../toolboxes/C2DContainer.hpp"
 
 /*!
  * \class CPhysicalGeometry
@@ -43,7 +44,7 @@ class CPhysicalGeometry final : public CGeometry {
   unsigned short *Global_to_Local_Marker;                   /*!< \brief Global to Local marker. */
   unsigned long *adj_counter;                               /*!< \brief Adjacency counter. */
   unsigned long **adjacent_elem;                            /*!< \brief Adjacency element list. */
-  su2double* Sensitivity;                                   /*!< \brief Vector holding the sensitivities at each point. */
+  su2activematrix Sensitivity;                              /*!< \brief Matrix holding the sensitivities at each point. */
 
   vector<vector<unsigned long> > Neighbors;
   map<unsigned long, unsigned long> Color_List;
@@ -618,114 +619,6 @@ class CPhysicalGeometry final : public CGeometry {
    */
   void FindNormal_Neighbor(CConfig *config) override;
 
-  /*!
-   * \brief Retrieve total number of nodes in a simulation across all processors (including halos).
-   * \return Total number of nodes in a simulation across all processors (including halos).
-   */
-  inline unsigned long GetGlobal_nPoint(void) const override { return Global_nPoint; }
-
-  /*!
-   * \brief Retrieve total number of nodes in a simulation across all processors (excluding halos).
-   * \return Total number of nodes in a simulation across all processors (excluding halos).
-   */
-  inline unsigned long GetGlobal_nPointDomain(void) const override { return Global_nPointDomain; }
-
-  /*!
-   * \brief Retrieve total number of elements in a simulation across all processors.
-   * \return Total number of elements in a simulation across all processors.
-   */
-  inline unsigned long GetGlobal_nElem(void) const override { return Global_nElem; }
-
-  /*!
-   * \brief  Retrieve total number of elements in a simulation across all processors (excluding halos).
-   * \return Total number of elements in a simulation across all processors (excluding halos).
-   */
-  inline unsigned long GetGlobal_nElemDomain(void) const override { return Global_nElemDomain; }
-
-  /*!
-   * \brief Retrieve total number of triangular elements in a simulation across all processors.
-   * \return Total number of line elements in a simulation across all processors.
-   */
-  inline unsigned long GetGlobal_nElemLine(void) const override { return Global_nelem_edge; }
-
-  /*!
-   * \brief Retrieve total number of triangular elements in a simulation across all processors.
-   * \return Total number of triangular elements in a simulation across all processors.
-   */
-  inline unsigned long GetGlobal_nElemTria(void) const override { return Global_nelem_triangle; }
-
-  /*!
-   * \brief Retrieve total number of quadrilateral elements in a simulation across all processors.
-   * \return Total number of quadrilateral elements in a simulation across all processors.
-   */
-  inline unsigned long GetGlobal_nElemQuad(void) const override { return Global_nelem_quad; }
-
-  /*!
-   * \brief Retrieve total number of tetrahedral elements in a simulation across all processors.
-   * \return Total number of tetrahedral elements in a simulation across all processors.
-   */
-  inline unsigned long GetGlobal_nElemTetr(void) const override { return Global_nelem_tetra; }
-
-  /*!
-   * \brief Retrieve total number of hexahedral elements in a simulation across all processors.
-   * \return Total number of hexahedral elements in a simulation across all processors.
-   */
-  inline unsigned long GetGlobal_nElemHexa(void) const override { return Global_nelem_hexa; }
-
-  /*!
-   * \brief Retrieve total number of prism elements in a simulation across all processors.
-   * \return Total number of prism elements in a simulation across all processors.
-   */
-  inline unsigned long GetGlobal_nElemPris(void) const override { return Global_nelem_prism; }
-
-  /*!
-   * \brief Retrieve total number of pyramid elements in a simulation across all processors.
-   * \return Total number of pyramid elements in a simulation across all processors.
-   */
-  inline unsigned long GetGlobal_nElemPyra(void) const override { return Global_nelem_pyramid; }
-
-  /*!
-   * \brief Get number of triangular elements.
-   * \return Number of line elements.
-   */
-  inline unsigned long GetnElemLine(void) const override { return nelem_edge; }
-
-  /*!
-   * \brief Get number of triangular elements.
-   * \return Number of triangular elements.
-   */
-  inline unsigned long GetnElemTria(void) const override { return nelem_triangle; }
-
-  /*!
-   * \brief Get number of quadrilateral elements.
-   * \return Number of quadrilateral elements.
-   */
-  inline unsigned long GetnElemQuad(void) const override { return nelem_quad; }
-
-  /*!
-   * \brief Get number of tetrahedral elements.
-   * \return Number of tetrahedral elements.
-   */
-  inline unsigned long GetnElemTetr(void) const override { return nelem_tetra; }
-
-  /*!
-   * \brief Get number of hexahedral elements.
-   * \return Number of hexahedral elements.
-   */
-  inline unsigned long GetnElemHexa(void) const override { return nelem_hexa; }
-
-  /*!
-   * \brief Get number of prism elements.
-   * \return Number of prism elements.
-   */
-  inline unsigned long GetnElemPris(void) const override { return nelem_prism; }
-
-  /*!
-   * \brief Get number of pyramid elements.
-   * \return Number of pyramid elements.
-   */
-  inline unsigned long GetnElemPyra(void) const override { return nelem_pyramid; }
-
   /*!
    * \brief Read the sensitivity from an input file.
    * \param[in] config - Definition of the particular problem.
@@ -878,7 +771,7 @@ class CPhysicalGeometry final : public CGeometry {
    * \param[in] iDim - The component of the dim. vector.
    * \return The sensitivity at point iPoint and dim. iDim.
    */
-  inline su2double GetSensitivity(unsigned long iPoint, unsigned short iDim) const override { return Sensitivity[iPoint*nDim+iDim]; }
+  inline su2double GetSensitivity(unsigned long iPoint, unsigned short iDim) const override { return Sensitivity(iPoint,iDim); }
 
   /*!
    * \brief Set the Sensitivity at a specific point.
@@ -886,7 +779,7 @@ class CPhysicalGeometry final : public CGeometry {
    * \param[in] iDim - The component of the dim. vector.
    * \param[in] val - Value of the sensitivity.
    */
-  inline void SetSensitivity(unsigned long iPoint, unsigned short iDim, su2double val) override {Sensitivity[iPoint*nDim+iDim] = val;}
+  inline void SetSensitivity(unsigned long iPoint, unsigned short iDim, su2double val) override { Sensitivity(iPoint,iDim) = val; }
 
   /*!
    * \brief Check the mesh for periodicity and deactivate multigrid if periodicity is found.
@@ -894,214 +787,4 @@ class CPhysicalGeometry final : public CGeometry {
    */
   void Check_Periodicity(CConfig *config) override;
 
-  /*!
-   * \brief Get the average normal at a specific span for a given marker in the turbomachinery reference of frame.
-   * \param[in] val_marker - marker value.
-   * \param[in] val_span - span value.
-   * \return The span-wise averaged turbo normal.
-   */
-  inline const su2double* GetAverageTurboNormal(unsigned short val_marker, unsigned short val_span) const override {
-    return AverageTurboNormal[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Get the average normal at a specific span for a given marker.
-   * \param[in] val_marker - marker value.
-   * \param[in] val_span - span value.
-   * \return The span-wise averaged normal.
-   */
-  inline const su2double* GetAverageNormal(unsigned short val_marker, unsigned short val_span) const override {
-    return AverageNormal[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Get the value of the total area for each span.
-   * \param[in] val_marker - marker value.
-   * \param[in] val_span - span value.
-   * \return The span-wise area.
-   */
-  inline su2double GetSpanArea(unsigned short val_marker, unsigned short val_span) const override {
-    return SpanArea[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Get the value of the total area for each span.
-   * \param[in] val_marker - marker value.
-   * \param[in] val_span - span value.
-   * \return The span-wise averaged turbo normal.
-   */
-  inline su2double GetTurboRadius(unsigned short val_marker, unsigned short val_span) const override {
-    return TurboRadius[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Get the value of the average tangential rotational velocity for each span.
-   * \param[in] val_marker - marker value.
-   * \param[in] val_span - span value.
-   * \return The span-wise averaged tangential velocity.
-   */
-  inline su2double GetAverageTangGridVel(unsigned short val_marker, unsigned short val_span) const override {
-    return AverageTangGridVel[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Get the value of the inflow tangential velocity at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   * \return The span-wise inflow tangential velocity.
-   */
-  inline su2double GetTangGridVelIn(unsigned short val_marker, unsigned short val_span) const override {
-    return TangGridVelIn[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Get the value of the outflow tangential velocity at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   * \return The span-wise outflow tangential velocity.
-   */
-  inline su2double GetTangGridVelOut(unsigned short val_marker, unsigned short val_span) const override {
-    return TangGridVelOut[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Get the value of the inflow area at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   * \return The span-wise inflow area.
-   */
-  inline su2double GetSpanAreaIn(unsigned short val_marker, unsigned short val_span) const override {
-    return SpanAreaIn[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Get the value of the outflow area at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   * \return The span-wise outflow area.
-   */
-  inline su2double GetSpanAreaOut(unsigned short val_marker, unsigned short val_span) const override {
-    return SpanAreaOut[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Get the value of the inflow radius at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   * \return The span-wise inflow radius.
-   */
-  inline su2double GetTurboRadiusIn(unsigned short val_marker, unsigned short val_span) const override {
-    return TurboRadiusIn[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Get the value of the outflow radius at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   * \return The span-wise outflow radius.
-   */
-  inline su2double GetTurboRadiusOut(unsigned short val_marker, unsigned short val_span) const override {
-    return TurboRadiusOut[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Set the value of the inflow tangential velocity at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   */
-  inline void SetTangGridVelIn(su2double value, unsigned short val_marker, unsigned short val_span) override {
-    TangGridVelIn[val_marker][val_span] = value;
-  }
-
-  /*!
-   * \brief Set the value of the outflow tangential velocity at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   */
-  inline void SetTangGridVelOut(su2double value, unsigned short val_marker, unsigned short val_span) override {
-    TangGridVelOut[val_marker][val_span] = value;
-  }
-
-  /*!
-   * \brief Get the value of the inflow area at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   */
-  inline void SetSpanAreaIn(su2double value, unsigned short val_marker, unsigned short val_span) override {
-    SpanAreaIn[val_marker][val_span] = value;
-  }
-
-  /*!
-   * \brief Set the value of the outflow area at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   */
-  inline void SetSpanAreaOut(su2double value, unsigned short val_marker, unsigned short val_span) override {
-    SpanAreaOut[val_marker][val_span] = value;
-  }
-
-  /*!
-   * \brief Set the value of the inflow radius at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   */
-  inline void SetTurboRadiusIn(su2double value, unsigned short val_marker, unsigned short val_span) override {
-    TurboRadiusIn[val_marker][val_span] = value;
-  }
-
-  /*!
-   * \brief Set the value of the outflow radius at each span.
-   * \param[in] val_marker - marker turbo-performance value.
-   * \param[in] val_span - span value.
-   */
-  inline void SetTurboRadiusOut(su2double value, unsigned short val_marker, unsigned short val_span) override {
-    TurboRadiusOut[val_marker][val_span] = value;
-  }
-
-  /*!
-   * \brief A total number of vertex independently from the MPI partions.
-   * \param[in] val_marker - marker value.
-   * \param[in] val_span - span value.
-   */
-  inline unsigned long GetnTotVertexSpan(unsigned short val_marker, unsigned short val_span) const override {
-    return nTotVertexSpan[val_marker][val_span];
-  }
-
-  /*!
-   * \brief min angular pitch independently from the MPI partions.
-   * \param[in] val_marker - marker value.
-   * \param[in] val_span - span value.
-   */
-  inline su2double GetMinAngularCoord(unsigned short val_marker, unsigned short val_span) const override {
-    return MinAngularCoord[val_marker][val_span];
-  }
-
-  /*!
-   * \brief max angular pitch independently from the MPI partions.
-   * \param[in] val_marker - marker value.
-   * \param[in] val_span - span value.
-   */
-  inline su2double GetMaxAngularCoord(unsigned short val_marker, unsigned short val_span) const override {
-    return MaxAngularCoord[val_marker][val_span];
-  }
-
-  /*!
-   * \brief min Relatice angular coord independently from the MPI partions.
-   * \param[in] val_marker - marker value.
-   * \param[in] val_span - span value.
-   */
-  inline su2double GetMinRelAngularCoord(unsigned short val_marker, unsigned short val_span) const override {
-    return MinRelAngularCoord[val_marker][val_span];
-  }
-
-  /*!
-   * \brief Get the average grid velocity at a specific span for a given marker.
-   * \param[in] val_marker - marker value.
-   * \param[in] val_span - span value.
-   */
-  inline const su2double* GetAverageGridVel(unsigned short val_marker, unsigned short val_span) const override {
-    return AverageGridVel[val_marker][val_span];
-  }
-
 };
-
diff --git a/Common/include/linear_algebra/CMatrixVectorProduct.hpp b/Common/include/linear_algebra/CMatrixVectorProduct.hpp
index 1458f4c01c59..712ca6f10747 100644
--- a/Common/include/linear_algebra/CMatrixVectorProduct.hpp
+++ b/Common/include/linear_algebra/CMatrixVectorProduct.hpp
@@ -7,7 +7,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
@@ -25,7 +25,7 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
  */
- 
+
 
 #pragma once
 
@@ -54,9 +54,8 @@
 template<class ScalarType>
 class CMatrixVectorProduct {
 public:
-  virtual ~CMatrixVectorProduct() = 0; ///< class destructor
-  virtual void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v)
-  const = 0; ///< matrix-vector product operation
+  virtual ~CMatrixVectorProduct() = 0;
+  virtual void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const = 0;
 };
 template<class ScalarType>
 CMatrixVectorProduct<ScalarType>::~CMatrixVectorProduct() {}
@@ -67,45 +66,37 @@ CMatrixVectorProduct<ScalarType>::~CMatrixVectorProduct() {}
  * \brief Specialization of matrix-vector product that uses CSysMatrix class
  */
 template<class ScalarType>
-class CSysMatrixVectorProduct : public CMatrixVectorProduct<ScalarType> {
+class CSysMatrixVectorProduct final : public CMatrixVectorProduct<ScalarType> {
 private:
-  CSysMatrix<ScalarType>* sparse_matrix; /*!< \brief pointer to matrix that defines the product. */
-  CGeometry* geometry;                   /*!< \brief pointer to matrix that defines the geometry. */
-  CConfig* config;                       /*!< \brief pointer to matrix that defines the config. */
-
-  /*!
-   * \brief Default constructor of the class
-   * \note This class cannot be default constructed as that would leave us with invalid pointers.
-   */
-  CSysMatrixVectorProduct();
+  const CSysMatrix<ScalarType>& matrix;  /*!< \brief reference to the matrix that defines the product. */
+  CGeometry* geometry;                   /*!< \brief geometry associated with the matrix. */
+  CConfig* config;                       /*!< \brief config of the problem. */
 
 public:
-
   /*!
    * \brief constructor of the class
    * \param[in] matrix_ref - matrix reference that will be used to define the products
    * \param[in] geometry_ref - geometry associated with the problem
    * \param[in] config_ref - config of the problem
    */
-  inline CSysMatrixVectorProduct(CSysMatrix<ScalarType> & matrix_ref,
-                                 CGeometry *geometry_ref, CConfig *config_ref) {
-    sparse_matrix = &matrix_ref;
-    geometry = geometry_ref;
-    config = config_ref;
-  }
+  inline CSysMatrixVectorProduct(const CSysMatrix<ScalarType> & matrix_ref,
+                                 CGeometry *geometry_ref, CConfig *config_ref) :
+    matrix(matrix_ref),
+    geometry(geometry_ref),
+    config(config_ref) {}
 
   /*!
-   * \brief destructor of the class
+   * \note This class cannot be default constructed as that would leave us with invalid pointers.
    */
-  ~CSysMatrixVectorProduct() {}
+  CSysMatrixVectorProduct() = delete;
 
   /*!
    * \brief operator that defines the CSysMatrix-CSysVector product
    * \param[in] u - CSysVector that is being multiplied by the sparse matrix
    * \param[out] v - CSysVector that is the result of the product
    */
-  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const {
-    sparse_matrix->MatrixVectorProduct(u, v, geometry, config);
+  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const override {
+    matrix.MatrixVectorProduct(u, v, geometry, config);
   }
 };
 
@@ -115,44 +106,36 @@ class CSysMatrixVectorProduct : public CMatrixVectorProduct<ScalarType> {
  * \brief Specialization of matrix-vector product that uses CSysMatrix class for transposed products
  */
 template<class ScalarType>
-class CSysMatrixVectorProductTransposed : public CMatrixVectorProduct<ScalarType> {
+class CSysMatrixVectorProductTransposed final : public CMatrixVectorProduct<ScalarType> {
 private:
-  CSysMatrix<ScalarType>* sparse_matrix; /*!< \brief pointer to matrix that defines the product. */
-  CGeometry* geometry;                   /*!< \brief pointer to matrix that defines the geometry. */
-  CConfig* config;                       /*!< \brief pointer to matrix that defines the config. */
-
-  /*!
-   * \brief Default constructor of the class
-   * \note This class cannot be default constructed as that would leave us with invalid pointers.
-   */
-  CSysMatrixVectorProductTransposed();
+  const CSysMatrix<ScalarType>& matrix;  /*!< \brief reference to the matrix that defines the product. */
+  CGeometry* geometry;                   /*!< \brief geometry associated with the matrix. */
+  CConfig* config;                       /*!< \brief config of the problem. */
 
 public:
-
   /*!
    * \brief constructor of the class
    * \param[in] matrix_ref - matrix reference that will be used to define the products
    * \param[in] geometry_ref - geometry associated with the problem
    * \param[in] config_ref - config of the problem
    */
-  inline CSysMatrixVectorProductTransposed(CSysMatrix<ScalarType> & matrix_ref,
-                                           CGeometry *geometry_ref, CConfig *config_ref) {
-    sparse_matrix = &matrix_ref;
-    geometry = geometry_ref;
-    config = config_ref;
-  }
+  inline CSysMatrixVectorProductTransposed(const CSysMatrix<ScalarType> & matrix_ref,
+                                           CGeometry *geometry_ref, CConfig *config_ref) :
+    matrix(matrix_ref),
+    geometry(geometry_ref),
+    config(config_ref) {}
 
   /*!
-   * \brief destructor of the class
+   * \note This class cannot be default constructed as that would leave us with invalid pointers.
    */
-  ~CSysMatrixVectorProductTransposed() {}
+  CSysMatrixVectorProductTransposed() = delete;
 
   /*!
    * \brief operator that defines the CSysMatrix-CSysVector product
    * \param[in] u - CSysVector that is being multiplied by the sparse matrix
    * \param[out] v - CSysVector that is the result of the product
    */
-  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const {
-    sparse_matrix->MatrixVectorProductTransposed(u, v, geometry, config);
+  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const override {
+    matrix.MatrixVectorProductTransposed(u, v, geometry, config);
   }
 };
diff --git a/Common/include/linear_algebra/CPastixWrapper.hpp b/Common/include/linear_algebra/CPastixWrapper.hpp
index d683e244403b..b9914ef1f63d 100644
--- a/Common/include/linear_algebra/CPastixWrapper.hpp
+++ b/Common/include/linear_algebra/CPastixWrapper.hpp
@@ -7,7 +7,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
@@ -30,14 +30,17 @@
 
 #ifdef HAVE_PASTIX
 
-#include "../config_structure.hpp"
-#include "../geometry/CGeometry.hpp"
-
 namespace PaStiX {
 extern "C" {
 #include <pastix.h>
 }
 }
+#include <vector>
+
+using namespace std;
+
+class CConfig;
+class CGeometry;
 
 /*!
  * \class CPastixWrapper
diff --git a/Common/include/linear_algebra/CPreconditioner.hpp b/Common/include/linear_algebra/CPreconditioner.hpp
index 7a55a6f662ce..dcc0a961dd61 100644
--- a/Common/include/linear_algebra/CPreconditioner.hpp
+++ b/Common/include/linear_algebra/CPreconditioner.hpp
@@ -1,13 +1,13 @@
 ﻿/*!
  * \file CPreconditioner.hpp
- * \brief Headers for the classes related to linear preconditioner wrappers.
+ * \brief Classes related to linear preconditioner wrappers.
  *        The actual operations are currently implemented mostly by CSysMatrix.
  * \author F. Palacios, J. Hicken, T. Economon
  * \version 7.0.0 "Blackbird"
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
@@ -36,7 +36,7 @@
 
 /*!
  * \class CPreconditioner
- * \brief abstract base class for defining preconditioning operation
+ * \brief Abstract base class for defining a preconditioning operation.
  * \author J. Hicken.
  *
  * See the remarks regarding the CMatrixVectorProduct class. The same
@@ -45,9 +45,20 @@
 template<class ScalarType>
 class CPreconditioner {
 public:
-  virtual ~CPreconditioner() = 0; ///< class destructor
-  virtual void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v)
-  const = 0; ///< preconditioning operation
+  /*!
+   * \brief Destructor of the class
+   */
+  virtual ~CPreconditioner() = 0;
+
+  /*!
+   * \brief Overload of operator (), applies the preconditioner to "u" storing the result in "v".
+   */
+  virtual void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const = 0;
+
+  /*!
+   * \brief Generic "preprocessing" hook derived classes may implement to build the preconditioner.
+   */
+  virtual void Build() {}
 };
 template<class ScalarType>
 CPreconditioner<ScalarType>::~CPreconditioner() {}
@@ -55,238 +66,260 @@ CPreconditioner<ScalarType>::~CPreconditioner() {}
 
 /*!
  * \class CJacobiPreconditioner
- * \brief specialization of preconditioner that uses CSysMatrix class
+ * \brief Specialization of preconditioner that uses CSysMatrix class.
  */
 template<class ScalarType>
-class CJacobiPreconditioner : public CPreconditioner<ScalarType> {
+class CJacobiPreconditioner final : public CPreconditioner<ScalarType> {
 private:
-  CSysMatrix<ScalarType>* sparse_matrix; /*!< \brief pointer to matrix that defines the preconditioner. */
-  CGeometry* geometry;                   /*!< \brief pointer to matrix that defines the geometry. */
-  CConfig* config;                       /*!< \brief pointer to matrix that defines the config. */
-
-  /*!
-   * \brief Default constructor of the class
-   * \note This class cannot be default constructed as that would leave us with invalid pointers.
-   */
-  CJacobiPreconditioner();
+  CSysMatrix<ScalarType>& sparse_matrix; /*!< \brief Reference to matrix that defines the preconditioner. */
+  CGeometry* geometry;                   /*!< \brief Pointer to geometry associated with the matrix. */
+  CConfig* config;                       /*!< \brief Pointer to problem configuration. */
+  bool transp;                           /*!< \brief If the transpose version of the preconditioner is required. */
 
 public:
-
   /*!
-   * \brief constructor of the class
-   * \param[in] matrix_ref - matrix reference that will be used to define the preconditioner
-   * \param[in] geometry_ref - geometry associated with the problem
-   * \param[in] config_ref - config of the problem
+   * \brief Constructor of the class.
+   * \param[in] matrix_ref - Matrix reference that will be used to define the preconditioner.
+   * \param[in] geometry_ref - Geometry associated with the problem.
+   * \param[in] config_ref - Config of the problem.
+   * \param[in] transposed - If the transpose version of the preconditioner is required.
    */
   inline CJacobiPreconditioner(CSysMatrix<ScalarType> & matrix_ref,
-                               CGeometry *geometry_ref, CConfig *config_ref) {
-    sparse_matrix = &matrix_ref;
+                               CGeometry *geometry_ref, CConfig *config_ref, bool transposed) :
+    sparse_matrix(matrix_ref)
+  {
+    if((geometry_ref == nullptr) || (config_ref == nullptr))
+      SU2_MPI::Error("Preconditioner needs to be built with valid references.", CURRENT_FUNCTION);
     geometry = geometry_ref;
     config = config_ref;
+    transp = transposed;
   }
 
   /*!
-   * \brief destructor of the class
+   * \note This class cannot be default constructed as that would leave us with invalid pointers.
    */
-  ~CJacobiPreconditioner() {}
+  CJacobiPreconditioner() = delete;
 
   /*!
    * \brief operator that defines the preconditioner operation
    * \param[in] u - CSysVector that is being preconditioned
    * \param[out] v - CSysVector that is the result of the preconditioning
    */
-  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const {
-    sparse_matrix->ComputeJacobiPreconditioner(u, v, geometry, config);
+  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const override {
+    sparse_matrix.ComputeJacobiPreconditioner(u, v, geometry, config);
+  }
+
+  /*!
+   * \note Request the associated matrix to build the preconditioner.
+   */
+  inline void Build() override {
+    sparse_matrix.BuildJacobiPreconditioner(transp);
   }
 };
 
 
 /*!
  * \class CILUPreconditioner
- * \brief specialization of preconditioner that uses CSysMatrix class
+ * \brief Specialization of preconditioner that uses CSysMatrix class
  */
 template<class ScalarType>
-class CILUPreconditioner : public CPreconditioner<ScalarType> {
+class CILUPreconditioner final : public CPreconditioner<ScalarType> {
 private:
-  CSysMatrix<ScalarType>* sparse_matrix; /*!< \brief pointer to matrix that defines the preconditioner. */
-  CGeometry* geometry;                   /*!< \brief pointer to matrix that defines the geometry. */
-  CConfig* config;                       /*!< \brief pointer to matrix that defines the config. */
-
-  /*!
-   * \brief Default constructor of the class
-   * \note This class cannot be default constructed as that would leave us with invalid pointers.
-   */
-  CILUPreconditioner();
+  CSysMatrix<ScalarType>& sparse_matrix; /*!< \brief Reference to matrix that defines the preconditioner. */
+  CGeometry* geometry;                   /*!< \brief Pointer to geometry associated with the matrix. */
+  CConfig* config;                       /*!< \brief Pointer to problem configuration. */
+  bool transp;                           /*!< \brief If the transpose version of the preconditioner is required. */
 
 public:
-
   /*!
-   * \brief constructor of the class
-   * \param[in] matrix_ref - matrix reference that will be used to define the preconditioner
-   * \param[in] geometry_ref - geometry associated with the problem
-   * \param[in] config_ref - config of the problem
+   * \brief Constructor of the class.
+   * \param[in] matrix_ref - Matrix reference that will be used to define the preconditioner.
+   * \param[in] geometry_ref - Geometry associated with the problem.
+   * \param[in] config_ref - Config of the problem.
+   * \param[in] transposed - If the transpose version of the preconditioner is required.
    */
   inline CILUPreconditioner(CSysMatrix<ScalarType> & matrix_ref,
-                            CGeometry *geometry_ref, CConfig *config_ref) {
-    sparse_matrix = &matrix_ref;
+                            CGeometry *geometry_ref, CConfig *config_ref, bool transposed) :
+    sparse_matrix(matrix_ref)
+  {
+    if((geometry_ref == nullptr) || (config_ref == nullptr))
+      SU2_MPI::Error("Preconditioner needs to be built with valid references.", CURRENT_FUNCTION);
     geometry = geometry_ref;
     config = config_ref;
+    transp = transposed;
   }
 
   /*!
-   * \brief destructor of the class
+   * \note This class cannot be default constructed as that would leave us with invalid pointers.
    */
-  ~CILUPreconditioner() {}
+  CILUPreconditioner() = delete;
 
   /*!
-   * \brief operator that defines the preconditioner operation
-   * \param[in] u - CSysVector that is being preconditioned
-   * \param[out] v - CSysVector that is the result of the preconditioning
+   * \brief Operator that defines the preconditioner operation.
+   * \param[in] u - CSysVector that is being preconditioned.
+   * \param[out] v - CSysVector that is the result of the preconditioning.
    */
-  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const {
-    sparse_matrix->ComputeILUPreconditioner(u, v, geometry, config);
+  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const override {
+    sparse_matrix.ComputeILUPreconditioner(u, v, geometry, config);
+  }
+
+  /*!
+   * \note Request the associated matrix to build the preconditioner.
+   */
+  inline void Build() override {
+    sparse_matrix.BuildILUPreconditioner(transp);
   }
 };
 
 
 /*!
  * \class CLU_SGSPreconditioner
- * \brief specialization of preconditioner that uses CSysMatrix class
+ * \brief Specialization of preconditioner that uses CSysMatrix class.
  */
 template<class ScalarType>
-class CLU_SGSPreconditioner : public CPreconditioner<ScalarType> {
+class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
 private:
-  CSysMatrix<ScalarType>* sparse_matrix; /*!< \brief pointer to matrix that defines the preconditioner. */
-  CGeometry* geometry;                   /*!< \brief pointer to matrix that defines the geometry. */
-  CConfig* config;                       /*!< \brief pointer to matrix that defines the config. */
-
-  /*!
-   * \brief Default constructor of the class
-   * \note This class cannot be default constructed as that would leave us with invalid pointers.
-   */
-  CLU_SGSPreconditioner();
+  CSysMatrix<ScalarType>& sparse_matrix; /*!< \brief Reference to matrix that defines the preconditioner. */
+  CGeometry* geometry;                   /*!< \brief Pointer to geometry associated with the matrix. */
+  CConfig* config;                       /*!< \brief Pointer to problem configuration. */
 
 public:
 
   /*!
-   * \brief constructor of the class
-   * \param[in] matrix_ref - matrix reference that will be used to define the preconditioner
-   * \param[in] geometry_ref - geometry associated with the problem
-   * \param[in] config_ref - config of the problem
+   * \brief Constructor of the class.
+   * \param[in] matrix_ref - Matrix reference that will be used to define the preconditioner.
+   * \param[in] geometry_ref - Geometry associated with the problem.
+   * \param[in] config_ref - Config of the problem.
    */
   inline CLU_SGSPreconditioner(CSysMatrix<ScalarType> & matrix_ref,
-                               CGeometry *geometry_ref, CConfig *config_ref) {
-    sparse_matrix = &matrix_ref;
+                               CGeometry *geometry_ref, CConfig *config_ref) :
+    sparse_matrix(matrix_ref)
+  {
+    if((geometry_ref == nullptr) || (config_ref == nullptr))
+      SU2_MPI::Error("Preconditioner needs to be built with valid references.", CURRENT_FUNCTION);
     geometry = geometry_ref;
     config = config_ref;
   }
 
   /*!
-   * \brief destructor of the class
+   * \note This class cannot be default constructed as that would leave us with invalid pointers.
    */
-  ~CLU_SGSPreconditioner() {}
+  CLU_SGSPreconditioner() = delete;
 
   /*!
-   * \brief operator that defines the preconditioner operation
-   * \param[in] u - CSysVector that is being preconditioned
-   * \param[out] v - CSysVector that is the result of the preconditioning
+   * \brief operator that defines the preconditioner operation.
+   * \param[in] u - CSysVector that is being preconditioned.
+   * \param[out] v - CSysVector that is the result of the preconditioning.
    */
-  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const {
-    sparse_matrix->ComputeLU_SGSPreconditioner(u, v, geometry, config);
+  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const override {
+    sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
   }
 };
 
 
 /*!
  * \class CLineletPreconditioner
- * \brief specialization of preconditioner that uses CSysMatrix class
+ * \brief Specialization of preconditioner that uses CSysMatrix class.
  */
 template<class ScalarType>
-class CLineletPreconditioner : public CPreconditioner<ScalarType> {
+class CLineletPreconditioner final : public CPreconditioner<ScalarType> {
 private:
-  CSysMatrix<ScalarType>* sparse_matrix; /*!< \brief pointer to matrix that defines the preconditioner. */
-  CGeometry* geometry;                   /*!< \brief pointer to matrix that defines the geometry. */
-  CConfig* config;                       /*!< \brief pointer to matrix that defines the config. */
-
-  /*!
-   * \brief Default constructor of the class
-   * \note This class cannot be default constructed as that would leave us with invalid pointers.
-   */
-  CLineletPreconditioner();
+  CSysMatrix<ScalarType>& sparse_matrix; /*!< \brief Reference to matrix that defines the preconditioner. */
+  CGeometry* geometry;                   /*!< \brief Pointer to geometry associated with the matrix. */
+  CConfig* config;                       /*!< \brief Pointer to problem configuration. */
 
 public:
-
   /*!
-   * \brief constructor of the class
-   * \param[in] matrix_ref - matrix reference that will be used to define the preconditioner
-   * \param[in] geometry_ref - geometry associated with the problem
-   * \param[in] config_ref - config of the problem
+   * \brief Constructor of the class.
+   * \param[in] matrix_ref - Matrix reference that will be used to define the preconditioner.
+   * \param[in] geometry_ref - Geometry associated with the problem.
+   * \param[in] config_ref - Config of the problem.
    */
   inline CLineletPreconditioner(CSysMatrix<ScalarType> & matrix_ref,
-                                CGeometry *geometry_ref, CConfig *config_ref) {
-    sparse_matrix = &matrix_ref;
+                                CGeometry *geometry_ref, CConfig *config_ref) :
+    sparse_matrix(matrix_ref)
+  {
+    if((geometry_ref == nullptr) || (config_ref == nullptr))
+      SU2_MPI::Error("Preconditioner needs to be built with valid references.", CURRENT_FUNCTION);
     geometry = geometry_ref;
     config = config_ref;
   }
 
   /*!
-   * \brief destructor of the class
+   * \note This class cannot be default constructed as that would leave us with invalid pointers.
    */
-  ~CLineletPreconditioner() {}
+  CLineletPreconditioner() = delete;
 
   /*!
-   * \brief operator that defines the preconditioner operation
-   * \param[in] u - CSysVector that is being preconditioned
-   * \param[out] v - CSysVector that is the result of the preconditioning
+   * \brief Operator that defines the preconditioner operation.
+   * \param[in] u - CSysVector that is being preconditioned.
+   * \param[out] v - CSysVector that is the result of the preconditioning.
    */
-  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const {
-    sparse_matrix->ComputeLineletPreconditioner(u, v, geometry, config);
+  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const override {
+    sparse_matrix.ComputeLineletPreconditioner(u, v, geometry, config);
+  }
+
+  /*!
+   * \note Request the associated matrix to build the preconditioner.
+   */
+  inline void Build() override {
+    sparse_matrix.BuildJacobiPreconditioner(false);
   }
 };
 
 
 /*!
  * \class CPastixPreconditioner
- * \brief Specialization of preconditioner that uses PaStiX to factorize a CSysMatrix
+ * \brief Specialization of preconditioner that uses PaStiX to factorize a CSysMatrix.
  */
 template<class ScalarType>
-class CPastixPreconditioner : public CPreconditioner<ScalarType> {
+class CPastixPreconditioner final : public CPreconditioner<ScalarType> {
 private:
-  CSysMatrix<ScalarType>* sparse_matrix; /*!< \brief Pointer to the matrix. */
+  CSysMatrix<ScalarType>& sparse_matrix; /*!< \brief Reference to the matrix. */
   CGeometry* geometry;                   /*!< \brief Geometry associated with the problem. */
   CConfig* config;                       /*!< \brief Configuration of the problem. */
+  unsigned short kind_fact;              /*!< \brief The type of factorization desired. */
+  bool transp;                           /*!< \brief If the transpose version of the preconditioner is required. */
 
 public:
-
   /*!
    * \brief Constructor of the class
-   * \param[in] matrix_ref - Matrix reference that will be used to define the preconditioner
-   * \param[in] geometry_ref - Associated geometry
-   * \param[in] config_ref - Problem configuration
+   * \param[in] matrix_ref - Matrix reference that will be used to define the preconditioner.
+   * \param[in] geometry_ref - Associated geometry.
+   * \param[in] config_ref - Problem configuration.
+   * \param[in] kind_factorization - Type of factorization required.
+   * \param[in] transposed - If the transpose version of the preconditioner is required.
    */
-  inline CPastixPreconditioner(CSysMatrix<ScalarType> & matrix_ref,
-                               CGeometry *geometry_ref, CConfig *config_ref) {
-    sparse_matrix = &matrix_ref;
+  inline CPastixPreconditioner(CSysMatrix<ScalarType> & matrix_ref, CGeometry *geometry_ref,
+                               CConfig *config_ref, unsigned short kind_factorization, bool transposed) :
+    sparse_matrix(matrix_ref)
+  {
+    if((geometry_ref == nullptr) || (config_ref == nullptr))
+      SU2_MPI::Error("Preconditioner needs to be built with valid references.", CURRENT_FUNCTION);
     geometry = geometry_ref;
     config = config_ref;
+    kind_fact = kind_factorization;
+    transp = transposed;
   }
 
   /*!
-   * \brief Destructor of the class
+   * \note This class cannot be default constructed as that would leave us with invalid pointers.
    */
-  ~CPastixPreconditioner() {}
+  CPastixPreconditioner() = delete;
 
   /*!
-   * \brief Operator that defines the preconditioner operation
-   * \param[in] u - CSysVector that is being preconditioned
-   * \param[out] v - CSysVector that is the result of the preconditioning
+   * \brief Operator that defines the preconditioner operation.
+   * \param[in] u - CSysVector that is being preconditioned.
+   * \param[out] v - CSysVector that is the result of the preconditioning.
+   */
+  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const override {
+    sparse_matrix.ComputePastixPreconditioner(u, v, geometry, config);
+  }
+
+  /*!
+   * \note Request the associated matrix to build the preconditioner.
    */
-  inline void operator()(const CSysVector<ScalarType> & u, CSysVector<ScalarType> & v) const {
-    if (sparse_matrix == NULL) {
-      cerr << "CPastixPreconditioner::operator()(const CSysVector &, CSysVector &): " << endl;
-      cerr << "pointer to sparse matrix is NULL." << endl;
-      throw(-1);
-    }
-    sparse_matrix->ComputePastixPreconditioner(u, v, geometry, config);
+  inline void Build() override {
+    sparse_matrix.BuildPastixPreconditioner(geometry, config, kind_fact, transp);
   }
 };
diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
index cebdc1f32293..0cd91f75ed18 100644
--- a/Common/include/linear_algebra/CSysMatrix.hpp
+++ b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -1,7 +1,7 @@
 /*!
- * \file matrix_structure.hpp
- * \brief Headers of the main subroutines for creating the sparse matrices-by-blocks.
- *        The subroutines and functions are in the <i>matrix_structure.cpp</i> file.
+ * \file CSysMatrix.hpp
+ * \brief Declaration of the block-sparse matrix class.
+ *        The implementation is in <i>CSysMatrix.cpp</i>.
  * \author F. Palacios, A. Bueno, T. Economon
  * \version 7.0.0 "Blackbird"
  *
@@ -28,23 +28,22 @@
 
 #pragma once
 
-#include "../mpi_structure.hpp"
-#include <limits>
-#include <iostream>
-#include <cmath>
-#include <cstdlib>
-
-#include "../config_structure.hpp"
-#include "../geometry/CGeometry.hpp"
+#include "../../include/mpi_structure.hpp"
 #include "CSysVector.hpp"
 #include "CPastixWrapper.hpp"
 
+#include <cstdlib>
+#include <vector>
+
+using namespace std;
+
+/*--- In forward mode the matrix is not of a built-in type. ---*/
 #if defined(HAVE_MKL) && !defined(CODI_FORWARD_TYPE)
 #include "mkl.h"
 #ifndef __INTEL_MKL__
   #error Could not determine the MKL version
 #endif
-/*--- JIT is only available since 2019 ---*/
+/*--- JIT is only available since 2019. ---*/
 #if __INTEL_MKL__ >= 2019
 #define USE_MKL
 /*---
@@ -59,50 +58,61 @@
 #endif
 #endif
 
-using namespace std;
-
-const su2double eps = numeric_limits<passivedouble>::epsilon(); /*!< \brief machine epsilon */
-
+class CConfig;
+class CGeometry;
 
 /*!
  * \class CSysMatrix
- * \brief Main class for defining sparse matrices-by-blocks
- with compressed row format.
+ * \brief Main class for defining block-compressed-row-storage sparse matrices.
  * \author A. Bueno, F. Palacios
  */
 template<class ScalarType>
 class CSysMatrix {
 private:
+  /*--- We are friends with all other possible CSysMatrices. ---*/
+  template<class T> friend class CSysMatrix;
+
   int rank;     /*!< \brief MPI Rank. */
   int size;     /*!< \brief MPI Size. */
-  unsigned long nPoint,   /*!< \brief Number of points in the grid. */
-  nPointDomain,           /*!< \brief Number of points in the grid. */
-  nVar,                   /*!< \brief Number of variables. */
-  nEqn;                   /*!< \brief Number of equations. */
-  ScalarType *matrix;            /*!< \brief Entries of the sparse matrix. */
-  ScalarType *ILU_matrix;         /*!< \brief Entries of the ILU sparse matrix. */
-  unsigned long nnz;                 /*!< \brief Number of possible nonzero entries in the matrix. */
-  unsigned long *row_ptr;            /*!< \brief Pointers to the first element in each row. */
-  unsigned long *col_ind;            /*!< \brief Column index for each of the elements in val(). */
-  unsigned long nnz_ilu;             /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
-  unsigned long *row_ptr_ilu;        /*!< \brief Pointers to the first element in each row (ILU). */
-  unsigned long *col_ind_ilu;        /*!< \brief Column index for each of the elements in val() (ILU). */
-  unsigned short ilu_fill_in;        /*!< \brief Fill in level for the ILU preconditioner. */
-
-  ScalarType *block;             /*!< \brief Internal array to store a subblock of the matrix. */
-  ScalarType *block_inverse;     /*!< \brief Internal array to store a subblock of the matrix. */
-  ScalarType *block_weight;      /*!< \brief Internal array to store a subblock of the matrix. */
-  ScalarType *prod_row_vector;   /*!< \brief Internal array to store the product of a matrix-by-blocks "row" with a vector. */
-  ScalarType *aux_vector;        /*!< \brief Auxiliary array to store intermediate results. */
-  ScalarType *sum_vector;        /*!< \brief Auxiliary array to store intermediate results. */
-  ScalarType *invM;              /*!< \brief Inverse of (Jacobi) preconditioner, or diagonal of ILU. */
-
-  unsigned long nLinelet;                       /*!< \brief Number of Linelets in the system. */
-  vector<bool> LineletBool;                     /*!< \brief Identify if a point belong to a Linelet. */
-  vector<vector<unsigned long> > LineletPoint;  /*!< \brief Linelet structure. */
-  vector<const ScalarType*> LineletUpper;       /*!< \brief Pointers to the upper blocks of the tri-diag system. */
-  vector<ScalarType> LineletInvDiag;            /*!< \brief Inverse of the diagonal blocks of the tri-diag system. */
-  vector<ScalarType> LineletVector;             /*!< \brief Solution and RHS of the tri-diag system. */
+
+  enum : size_t { MAXNVAR = 8 };    /*!< \brief Maximum number of variables the matrix can handle. The static
+                                                size is needed for fast, per-thread, static memory allocation. */
+
+  enum { OMP_MAX_SIZE_L = 8192 };   /*!< \brief Max. chunk size used in light parallel for loops. */
+  enum { OMP_MAX_SIZE_H = 512 };    /*!< \brief Max. chunk size used in heavy parallel for loops. */
+  unsigned long omp_light_size;     /*!< \brief Actual chunk size used in light loops (e.g. over non zeros). */
+  unsigned long omp_heavy_size;     /*!< \brief Actual chunk size used in heavy loops (e.g. over rows). */
+  unsigned long omp_num_parts;      /*!< \brief Number of threads used in thread-parallel LU_SGS and ILU. */
+  unsigned long *omp_partitions;    /*!< \brief Point indexes of LU_SGS and ILU thread-parallel sub partitioning. */
+
+  unsigned long nPoint;             /*!< \brief Number of points in the grid. */
+  unsigned long nPointDomain;       /*!< \brief Number of points in the grid (excluding halos). */
+  unsigned long nVar;               /*!< \brief Number of variables. */
+  unsigned long nEqn;               /*!< \brief Number of equations. */
+
+  ScalarType *matrix;               /*!< \brief Entries of the sparse matrix. */
+  unsigned long nnz;                /*!< \brief Number of possible nonzero entries in the matrix. */
+  const unsigned long *row_ptr;     /*!< \brief Pointers to the first element in each row. */
+  const unsigned long *dia_ptr;     /*!< \brief Pointers to the diagonal element in each row. */
+  const unsigned long *col_ind;     /*!< \brief Column index for each of the elements in val(). */
+
+  ScalarType *ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
+  unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
+  const unsigned long *row_ptr_ilu; /*!< \brief Pointers to the first element in each row (ILU). */
+  const unsigned long *dia_ptr_ilu; /*!< \brief Pointers to the diagonal element in each row (ILU). */
+  const unsigned long *col_ind_ilu; /*!< \brief Column index for each of the elements in val() (ILU). */
+  unsigned short ilu_fill_in;       /*!< \brief Fill in level for the ILU preconditioner. */
+
+  ScalarType *invM;                 /*!< \brief Inverse of (Jacobi) preconditioner, or diagonal of ILU. */
+
+  unsigned long nLinelet;                      /*!< \brief Number of Linelets in the system. */
+  vector<bool> LineletBool;                    /*!< \brief Identify if a point belongs to a Linelet. */
+  vector<vector<unsigned long> > LineletPoint; /*!< \brief Linelet structure. */
+
+  /*--- Temporary (hence mutable) working memory used in the Linelet preconditioner, outer vector is for threads ---*/
+  mutable vector<vector<const ScalarType*> > LineletUpper; /*!< \brief Pointers to the upper blocks of the tri-diag system (working memory). */
+  mutable vector<vector<ScalarType> > LineletInvDiag;      /*!< \brief Inverse of the diagonal blocks of the tri-diag system (working memory). */
+  mutable vector<vector<ScalarType> > LineletVector;       /*!< \brief Solution and RHS of the tri-diag system (working memory). */
 
 #ifdef USE_MKL
   void * MatrixMatrixProductJitter;                            /*!< \brief Jitter handle for MKL JIT based GEMM. */
@@ -115,13 +125,26 @@ class CSysMatrix {
   dgemm_jit_kernel_t MatrixVectorProductKernelAlphaMinusOne;   /*!< \brief MKL JIT based GEMV kernel with ALPHA=-1.0 and BETA=1.0. */
   void * MatrixVectorProductTranspJitterBetaOne;               /*!< \brief Jitter handle for MKL JIT based GEMV (transposed) with BETA=1.0. */
   dgemm_jit_kernel_t MatrixVectorProductTranspKernelBetaOne;   /*!< \brief MKL JIT based GEMV (transposed) kernel with BETA=1.0. */
-  lapack_int * mkl_ipiv;
 #endif
 
 #ifdef HAVE_PASTIX
-  CPastixWrapper pastix_wrapper;
+  mutable CPastixWrapper pastix_wrapper;
 #endif
 
+  /*!
+   * \brief Auxiliary object to wrap the edge map pointer used in fast block updates, i.e. without linear searches.
+   */
+  struct {
+    const unsigned long *ptr = nullptr;
+
+    inline unsigned long operator() (unsigned long edge, unsigned long node) const {
+      return ptr[2*edge+node];
+    }
+    inline unsigned long ij(unsigned long edge) const { return ptr[2*edge]; }
+    inline unsigned long ji(unsigned long edge) const { return ptr[2*edge+1]; }
+
+  } edge_ptr;
+
   /*!
    * \brief Handle type conversion for when we Set, Add, etc. blocks, preserving derivative information (if supported by types).
    * \note See specializations for discrete adjoint right outside this class's declaration.
@@ -141,36 +164,13 @@ class CSysMatrix {
 #endif
   }
 
-  /*!
-   * \brief Assigns values to the sparse-matrix structure (used in Initialize).
-   * \param[in] val_nPoint - Number of points in the nPoint x nPoint block structure
-   * \param[in] val_nVar - Number of nVar x nVar variables in each subblock of the matrix-by-block structure.
-   * \param[in] val_nEq - Number of nEqn x nVar variables in each subblock of the matrix-by-block structure.
-   * \param[in] val_row_ptr - Pointers to the first element in each row.
-   * \param[in] val_col_ind - Column index for each of the elements in val().
-   * \param[in] val_nnz - Number of possible nonzero entries in the matrix.
-   * \param[in] config - Definition of the particular problem.
-   */
-  void SetIndexes(unsigned long val_nPoint, unsigned long val_nPointDomain, unsigned short val_nVar, unsigned short val_nEq, unsigned long* val_row_ptr, unsigned long* val_col_ind, unsigned long val_nnz, CConfig *config);
-
-  /*!
-   * \brief Assigns values to the sparse-matrix structure (used in Initialize).
-   * \param[in] geometry - Geometrical definition of the problem.
-   * \param[in] iPoint - Base point to compute neighbours.
-   * \param[in] deep_level - Deep level for the recursive algorithm.
-   * \param[in] fill_level - ILU fill in level.
-   * \param[in] EdgeConnect - There is (or not) an edge structure).
-   * \param[in] vneighs - Storage the neighbours points to iPoint.
-   */
-  void SetNeighbours(CGeometry *geometry, unsigned long iPoint, unsigned short deep_level, unsigned short fill_level, bool EdgeConnect, vector<unsigned long> & vneighs);
-
   /*!
    * \brief Calculates the matrix-vector product: product = matrix*vector
    * \param[in] matrix
    * \param[in] vector
    * \param[out] product
    */
-  inline void MatrixVectorProduct(const ScalarType *matrix, const ScalarType *vector, ScalarType *product);
+  inline void MatrixVectorProduct(const ScalarType *matrix, const ScalarType *vector, ScalarType *product) const;
 
   /*!
    * \brief Calculates the matrix-vector product: product += matrix*vector
@@ -178,7 +178,7 @@ class CSysMatrix {
    * \param[in] vector
    * \param[in,out] product
    */
-  inline void MatrixVectorProductAdd(const ScalarType *matrix, const ScalarType *vector, ScalarType *product);
+  inline void MatrixVectorProductAdd(const ScalarType *matrix, const ScalarType *vector, ScalarType *product) const;
 
   /*!
    * \brief Calculates the matrix-vector product: product -= matrix*vector
@@ -186,7 +186,7 @@ class CSysMatrix {
    * \param[in] vector
    * \param[in,out] product
    */
-  inline void MatrixVectorProductSub(const ScalarType *matrix, const ScalarType *vector, ScalarType *product);
+  inline void MatrixVectorProductSub(const ScalarType *matrix, const ScalarType *vector, ScalarType *product) const;
 
   /*!
    * \brief Calculates the matrix-vector product: product += matrix^T * vector
@@ -194,20 +194,17 @@ class CSysMatrix {
    * \param[in] vector
    * \param[in,out] product
    */
-  inline void MatrixVectorProductTransp(const ScalarType *matrix, const ScalarType *vector, ScalarType *product);
+  inline void MatrixVectorProductTransp(const ScalarType *matrix, const ScalarType *vector, ScalarType *product) const;
 
   /*!
    * \brief Calculates the matrix-matrix product
-   * \param[in] matrix_a
-   * \param[in] matrix_b
-   * \param[out] product
    */
-  inline void MatrixMatrixProduct(const ScalarType *matrix_a, const ScalarType *matrix_b, ScalarType *product);
+  inline void MatrixMatrixProduct(const ScalarType *matrix_a, const ScalarType *matrix_b, ScalarType *product) const;
 
   /*!
    * \brief Subtract b from a and store the result in c.
    */
-  inline void VectorSubtraction(const ScalarType *a, const ScalarType *b, ScalarType *c) {
+  inline void VectorSubtraction(const ScalarType *a, const ScalarType *b, ScalarType *c) const {
     for(unsigned long iVar = 0; iVar < nVar; iVar++)
       c[iVar] = a[iVar] - b[iVar];
   }
@@ -215,47 +212,62 @@ class CSysMatrix {
   /*!
    * \brief Subtract b from a and store the result in c.
    */
-  inline void MatrixSubtraction(const ScalarType *a, const ScalarType *b, ScalarType *c) {
+  inline void MatrixSubtraction(const ScalarType *a, const ScalarType *b, ScalarType *c) const {
     for(unsigned long iVar = 0; iVar < nVar*nEqn; iVar++)
       c[iVar] = a[iVar] - b[iVar];
   }
 
+  /*!
+   * \brief Copy the nVar x nVar matrix src into dst, transposing it if requested.
+   */
+  inline void MatrixCopy(const ScalarType *src, ScalarType *dst, bool transposed = false) const {
+    if (!transposed) {
+      for(auto iVar = 0ul; iVar < nVar*nVar; ++iVar)
+        dst[iVar] = src[iVar];
+    }
+    else { // not an in-place transpose: src and dst must be distinct buffers on this path
+      for (auto iVar = 0ul; iVar < nVar; ++iVar)
+        for (auto jVar = 0ul; jVar < nVar; ++jVar)
+          dst[iVar*nVar+jVar] = src[jVar*nVar+iVar]; // entry (i,j) of dst is entry (j,i) of src
+    }
+  }
+
   /*!
    * \brief Solve a small (nVar x nVar) linear system using Gaussian elimination.
    * \param[in,out] matrix - On entry the system matrix, on exit the factorized matrix.
    * \param[in,out] vec - On entry the rhs, on exit the solution.
    */
-  inline void Gauss_Elimination(ScalarType* matrix, ScalarType* vec);
+  void Gauss_Elimination(ScalarType* matrix, ScalarType* vec) const;
 
   /*!
    * \brief Invert a small dense matrix.
-   * \param[in] matrix - the matrix.
+   * \param[in,out] matrix - On entry the system matrix, on exit the factorized matrix.
    * \param[out] inverse - the matrix inverse.
    */
-  inline void MatrixInverse(const ScalarType *matrix, ScalarType *inverse);
+  void MatrixInverse(ScalarType *matrix, ScalarType *inverse) const;
 
   /*!
-   * \brief Performs the Gauss Elimination algorithm to solve the linear subsystem of the (i, i) subblock and rhs.
-   * \param[in] block_i - Index of the (i, i) subblock in the matrix-by-blocks structure.
+   * \brief Performs the Gauss Elimination algorithm to solve the linear subsystem of the (i,i) subblock and rhs.
+   * \param[in] block_i - Index of the (i,i) diagonal block.
    * \param[in] rhs - Right-hand-side of the linear system.
    * \param[in] transposed - If true the transposed of the block is used (default = false).
    * \return Solution of the linear system (overwritten on rhs).
    */
-  inline void Gauss_Elimination(unsigned long block_i, ScalarType* rhs, bool transposed = false);
+  inline void Gauss_Elimination(unsigned long block_i, ScalarType* rhs, bool transposed = false) const;
 
   /*!
    * \brief Inverse diagonal block.
    * \param[in] block_i - Indexes of the block in the matrix-by-blocks structure.
    * \param[out] invBlock - Inverse block.
    */
-  inline void InverseDiagonalBlock(unsigned long block_i, ScalarType *invBlock, bool transpose = false);
+  inline void InverseDiagonalBlock(unsigned long block_i, ScalarType *invBlock, bool transposed = false) const;
 
   /*!
    * \brief Inverse diagonal block.
    * \param[in] block_i - Indexes of the block in the matrix-by-blocks structure.
    * \param[out] invBlock - Inverse block.
    */
-  inline void InverseDiagonalBlock_ILUMatrix(unsigned long block_i, ScalarType *invBlock);
+  inline void InverseDiagonalBlock_ILUMatrix(unsigned long block_i, ScalarType *invBlock) const;
 
   /*!
    * \brief Copies the block (i, j) of the matrix-by-blocks structure in the internal variable *block.
@@ -280,29 +292,25 @@ class CSysMatrix {
    */
   inline void SetBlockTransposed_ILUMatrix(unsigned long block_i, unsigned long block_j, ScalarType *val_block);
 
-  /*!
-   * \brief Subtracts the specified block to the sparse matrix.
-   * \param[in] block_i - Indexes of the block in the matrix-by-blocks structure.
-   * \param[in] block_j - Indexes of the block in the matrix-by-blocks structure.
-   * \param[in] **val_block - Block to subtract to A(i, j).
-   */
-  inline void SubtractBlock_ILUMatrix(unsigned long block_i, unsigned long block_j, ScalarType *val_block);
-
   /*!
    * \brief Performs the product of i-th row of the upper part of a sparse matrix by a vector.
    * \param[in] vec - Vector to be multiplied by the upper part of the sparse matrix A.
    * \param[in] row_i - Row of the matrix to be multiplied by vector vec.
-   * \return prod Result of the product U(A)*vec (stored at *prod_row_vector).
+   * \param[in] col_ub - Exclusive upper bound for column indices considered in multiplication.
+   * \param[out] prod - Result of the product U(A)*vec.
    */
-  void UpperProduct(const CSysVector<ScalarType> & vec, unsigned long row_i);
+  inline void UpperProduct(const CSysVector<ScalarType> & vec, unsigned long row_i,
+                           unsigned long col_ub, ScalarType *prod) const;
 
   /*!
    * \brief Performs the product of i-th row of the lower part of a sparse matrix by a vector.
    * \param[in] vec - Vector to be multiplied by the lower part of the sparse matrix A.
    * \param[in] row_i - Row of the matrix to be multiplied by vector vec.
-   * \return prod Result of the product L(A)*vec (stored at *prod_row_vector).
+   * \param[in] col_lb - Inclusive lower bound for column indices considered in multiplication.
+   * \param[out] prod - Result of the product L(A)*vec.
    */
-  void LowerProduct(const CSysVector<ScalarType> & vec, unsigned long row_i);
+  inline void LowerProduct(const CSysVector<ScalarType> & vec, unsigned long row_i,
+                           unsigned long col_lb, ScalarType *prod) const;
 
   /*!
    * \brief Performs the product of i-th row of the diagonal part of a sparse matrix by a vector.
@@ -310,7 +318,7 @@ class CSysMatrix {
    * \param[in] row_i - Row of the matrix to be multiplied by vector vec.
    * \return prod Result of the product D(A)*vec (stored at *prod_row_vector).
    */
-  void DiagonalProduct(const CSysVector<ScalarType> & vec, unsigned long row_i);
+  inline void DiagonalProduct(const CSysVector<ScalarType> & vec, unsigned long row_i, ScalarType *prod) const;
 
   /*!
    * \brief Performs the product of i-th row of a sparse matrix by a vector.
@@ -318,7 +326,7 @@ class CSysMatrix {
    * \param[in] row_i - Row of the matrix to be multiplied by vector vec.
    * \return Result of the product (stored at *prod_row_vector).
    */
-  void RowProduct(const CSysVector<ScalarType> & vec, unsigned long row_i);
+  void RowProduct(const CSysVector<ScalarType> & vec, unsigned long row_i, ScalarType *prod) const;
 
 public:
 
@@ -334,38 +342,44 @@ class CSysMatrix {
 
   /*!
    * \brief Initializes sparse matrix system.
-   * \param[in] nVar - Number of variables.
-   * \param[in] nEqn - Number of equations.
+   * \param[in] npoint - Number of points including halos.
+   * \param[in] npointdomain - Number of points excluding halos.
+   * \param[in] nvar - Number of variables.
+   * \param[in] neqn - Number of equations.
    * \param[in] geometry - Geometrical definition of the problem.
    * \param[in] config - Definition of the particular problem.
    */
-  void Initialize(unsigned long nPoint, unsigned long nPointDomain, unsigned short nVar, unsigned short nEqn,
+  void Initialize(unsigned long npoint, unsigned long npointdomain,
+                  unsigned short nvar, unsigned short neqn,
                   bool EdgeConnect, CGeometry *geometry, CConfig *config);
 
   /*!
    * \brief Sets to zero all the entries of the sparse matrix.
    */
-  inline void SetValZero(void) {
-    if(matrix != NULL)
-      for (unsigned long index = 0; index < nnz*nVar*nEqn; index++)
-        matrix[index] = 0.0;
-  }
+  void SetValZero(void);
+
+  /*!
+   * \brief Sets to zero all the block diagonal entries of the sparse matrix.
+   */
+  void SetValDiagonalZero(void);
 
   /*!
-   * \brief Routine to load a vector quantity into the data structures for MPI point-to-point communication and to launch non-blocking sends and recvs.
+   * \brief Routine to load a vector quantity into the data structures for MPI point-to-point
+   *        communication and to launch non-blocking sends and recvs.
    * \param[in] x        - CSysVector holding the array of data.
    * \param[in] geometry - Geometrical definition of the problem.
    * \param[in] config   - Definition of the particular problem.
    * \param[in] commType - Enumerated type for the quantity to be communicated.
    */
   template<class OtherType>
-  void InitiateComms(CSysVector<OtherType> & x,
+  void InitiateComms(const CSysVector<OtherType> & x,
                      CGeometry *geometry,
                      CConfig *config,
-                     unsigned short commType);
+                     unsigned short commType) const;
 
   /*!
-   * \brief Routine to complete the set of non-blocking communications launched by InitiateComms() and unpacking of the data in the vector.
+   * \brief Routine to complete the set of non-blocking communications launched by
+   *        InitiateComms() and unpacking of the data in the vector.
    * \param[in] x        - CSysVector holding the array of data.
    * \param[in] geometry - Geometrical definition of the problem.
    * \param[in] config   - Definition of the particular problem.
@@ -375,7 +389,7 @@ class CSysMatrix {
   void CompleteComms(CSysVector<OtherType> & x,
                      CGeometry *geometry,
                      CConfig *config,
-                     unsigned short commType);
+                     unsigned short commType) const;
 
   /*!
    * \brief Get a pointer to the start of block "ij"
@@ -388,8 +402,18 @@ class CSysMatrix {
     for (unsigned long index = row_ptr[block_i]; index < row_ptr[block_i+1]; index++)
       if (col_ind[index] == block_j)
         return &(matrix[index*nVar*nEqn]);
+    return nullptr;
+  }
+
+  /*!
+   * \brief Get a pointer to the start of block "ij", const version (nullptr if row block_i stores no block for column block_j).
+   */
+  inline const ScalarType *GetBlock(unsigned long block_i, unsigned long block_j) const {
 
-    return NULL;
+    for (unsigned long index = row_ptr[block_i]; index < row_ptr[block_i+1]; index++)
+      if (col_ind[index] == block_j)
+        return &(matrix[index*nVar*nEqn]);
+    return nullptr;
   }
 
   /*!
@@ -401,12 +425,11 @@ class CSysMatrix {
    * \return Value of the block entry.
    */
   inline ScalarType GetBlock(unsigned long block_i, unsigned long block_j,
-                             unsigned short iVar, unsigned short jVar) {
+                             unsigned short iVar, unsigned short jVar) const {
 
     for (unsigned long index = row_ptr[block_i]; index < row_ptr[block_i+1]; index++)
       if (col_ind[index] == block_j)
         return matrix[index*nVar*nEqn+iVar*nEqn+jVar];
-
     return 0.0;
   }
 
@@ -414,7 +437,7 @@ class CSysMatrix {
    * \brief Set the value of a block in the sparse matrix.
    * \param[in] block_i - Row index.
    * \param[in] block_j - Column index.
-   * \param[in] **val_block - Block to set to A(i, j).
+   * \param[in] val_block - Block to set to A(i, j).
    */
   template<class OtherType>
   inline void SetBlock(unsigned long block_i, unsigned long block_j, OtherType **val_block) {
@@ -435,7 +458,7 @@ class CSysMatrix {
    * \brief Set the value of a block in the sparse matrix.
    * \param[in] block_i - Row index.
    * \param[in] block_j - Column index.
-   * \param[in] *val_block - Block to set to A(i, j).
+   * \param[in] val_block - Block to set to A(i, j).
    */
   template<class OtherType>
   inline void SetBlock(unsigned long block_i, unsigned long block_j, OtherType *val_block) {
@@ -455,7 +478,7 @@ class CSysMatrix {
    * \brief Adds the specified block to the sparse matrix.
    * \param[in] block_i - Row index.
    * \param[in] block_j - Column index.
-   * \param[in] **val_block - Block to add to A(i, j).
+   * \param[in] val_block - Block to add to A(i, j).
    */
   template<class OtherType>
   inline void AddBlock(unsigned long block_i, unsigned long block_j, OtherType **val_block) {
@@ -476,7 +499,7 @@ class CSysMatrix {
    * \brief Subtracts the specified block to the sparse matrix.
    * \param[in] block_i - Row index.
    * \param[in] block_j - Column index.
-   * \param[in] **val_block - Block to subtract to A(i, j).
+   * \param[in] val_block - Block to subtract to A(i, j).
    */
   template<class OtherType>
   inline void SubtractBlock(unsigned long block_i, unsigned long block_j, OtherType **val_block) {
@@ -493,6 +516,38 @@ class CSysMatrix {
     }
   }
 
+  /*!
+   * \brief Update 4 blocks ii, ij, ji, jj (add to i* sub from j*).
+   * \note The template parameter Sign can be used to create a "subtractive"
+   *       update, i.e. subtract from row i and add to row j instead.
+   * \param[in] iEdge - Index of edge that connects iPoint and jPoint.
+   * \param[in] iPoint - Row to which we add the blocks.
+   * \param[in] jPoint - Row from which we subtract the blocks.
+   * \param[in] block_i - Adds to ii, subs from ji.
+   * \param[in] block_j - Adds to ij, subs from jj.
+   */
+  template<class OtherType, int Sign = 1>
+  inline void UpdateBlocks(unsigned long iEdge, unsigned long iPoint, unsigned long jPoint,
+                           OtherType **block_i, OtherType **block_j) {
+
+    ScalarType *bii = &matrix[dia_ptr[iPoint]*nVar*nEqn]; // diagonal blocks are located through dia_ptr
+    ScalarType *bjj = &matrix[dia_ptr[jPoint]*nVar*nEqn];
+    ScalarType *bij = &matrix[edge_ptr(iEdge,0)*nVar*nEqn]; // off-diagonal blocks are located through the edge map, no search
+    ScalarType *bji = &matrix[edge_ptr(iEdge,1)*nVar*nEqn];
+
+    unsigned long iVar, jVar, offset = 0;
+
+    for (iVar = 0; iVar < nVar; iVar++) {
+      for (jVar = 0; jVar < nEqn; jVar++) {
+        bii[offset] += PassiveAssign<ScalarType,OtherType>(block_i[iVar][jVar]) * Sign;
+        bij[offset] += PassiveAssign<ScalarType,OtherType>(block_j[iVar][jVar]) * Sign;
+        bji[offset] -= PassiveAssign<ScalarType,OtherType>(block_i[iVar][jVar]) * Sign;
+        bjj[offset] -= PassiveAssign<ScalarType,OtherType>(block_j[iVar][jVar]) * Sign;
+        ++offset;
+      }
+    }
+  }
+
   /*!
    * \brief Adds the specified value to the diagonal of the (i, i) subblock
    *        of the matrix-by-blocks structure.
@@ -554,6 +609,15 @@ class CSysMatrix {
   template<class OtherType>
   void EnforceSolutionAtNode(const unsigned long node_i, const OtherType *x_i, CSysVector<OtherType> & b);
 
+  /*!
+   * \brief Add a scaled sparse matrix to "this" (axpy-type operation, A = A+alpha*B).
+   * \note Matrices must have the same sparse pattern.
+   * \param[in] alpha - The scaling constant.
+   * \param[in] B - Matrix being added (scaled by alpha).
+   */
+  template<class OtherType>
+  void MatrixMatrixAddition(OtherType alpha, const CSysMatrix<OtherType>& B);
+
   /*!
    * \brief Performs the product of a sparse matrix by a CSysVector.
    * \param[in] vec - CSysVector to be multiplied by the sparse matrix A.
@@ -561,7 +625,8 @@ class CSysMatrix {
    * \param[in] config - Definition of the particular problem.
    * \param[out] prod - Result of the product.
    */
-  void MatrixVectorProduct(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config);
+  void MatrixVectorProduct(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                           CGeometry *geometry, CConfig *config) const;
 
   /*!
    * \brief Performs the product of a sparse matrix by a CSysVector.
@@ -570,7 +635,8 @@ class CSysMatrix {
    * \param[in] config - Definition of the particular problem.
    * \param[out] prod - Result of the product.
    */
-  void MatrixVectorProductTransposed(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config);
+  void MatrixVectorProductTransposed(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                                     CGeometry *geometry, CConfig *config) const;
 
   /*!
    * \brief Build the Jacobi preconditioner.
@@ -584,7 +650,8 @@ class CSysMatrix {
    * \param[in] geometry - Geometrical definition of the problem.
    * \param[in] config - Definition of the particular problem.
    */
-  void ComputeJacobiPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config);
+  void ComputeJacobiPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                                   CGeometry *geometry, CConfig *config) const;
 
   /*!
    * \brief Build the ILU preconditioner.
@@ -599,36 +666,41 @@ class CSysMatrix {
    * \param[in] geometry - Geometrical definition of the problem.
    * \param[in] config - Definition of the particular problem.
    */
-  void ComputeILUPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config);
+  void ComputeILUPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                                CGeometry *geometry, CConfig *config) const;
 
   /*!
    * \brief Multiply CSysVector by the preconditioner
    * \param[in] vec - CSysVector to be multiplied by the preconditioner.
    * \param[out] prod - Result of the product A*vec.
    */
-  void ComputeLU_SGSPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config);
+  void ComputeLU_SGSPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                                   CGeometry *geometry, CConfig *config) const;
 
   /*!
    * \brief Build the Linelet preconditioner.
    * \param[in] geometry - Geometrical definition of the problem.
    * \param[in] config - Definition of the particular problem.
+   * \return Average number of points per linelet.
    */
-  unsigned short BuildLineletPreconditioner(CGeometry *geometry, CConfig *config);
+  unsigned long BuildLineletPreconditioner(CGeometry *geometry, CConfig *config);
 
   /*!
    * \brief Multiply CSysVector by the preconditioner
    * \param[in] vec - CSysVector to be multiplied by the preconditioner.
    * \param[out] prod - Result of the product A*vec.
    */
-  void ComputeLineletPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config);
+  void ComputeLineletPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                                    CGeometry *geometry, CConfig *config) const;
 
   /*!
-   * \brief Compute the residual Ax-b
-   * \param[in] sol - CSysVector to be multiplied by the preconditioner.
-   * \param[in] f - Result of the product A*vec.
-   * \param[out] res - Result of the product A*vec.
+   * \brief Compute the linear residual.
+   * \param[in] sol - Solution (x).
+   * \param[in] f - Right hand side (b).
+   * \param[out] res - Residual (Ax-b).
    */
-  void ComputeResidual(const CSysVector<ScalarType> & sol, const CSysVector<ScalarType> & f, CSysVector<ScalarType> & res);
+  void ComputeResidual(const CSysVector<ScalarType> & sol, const CSysVector<ScalarType> & f,
+                       CSysVector<ScalarType> & res) const;
 
   /*!
    * \brief Factorize matrix using PaStiX.
@@ -646,7 +718,8 @@ class CSysMatrix {
    * \param[in] geometry - Geometrical definition of the problem.
    * \param[in] config - Definition of the particular problem.
    */
-  void ComputePastixPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config);
+  void ComputePastixPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                                   CGeometry *geometry, CConfig *config) const;
 
 };
 
diff --git a/Common/include/linear_algebra/CSysMatrix.inl b/Common/include/linear_algebra/CSysMatrix.inl
index 0eb1e8374e7d..de11c89e55f0 100644
--- a/Common/include/linear_algebra/CSysMatrix.inl
+++ b/Common/include/linear_algebra/CSysMatrix.inl
@@ -1,29 +1,18 @@
 /*!
- * \file matrix_structure.inl
- * \brief In-Line subroutines of the <i>matrix_structure.hpp</i> file.
- * \note These are the "private" inlines, they are not needed outside of
- * the .cpp file and so they are hidden to avoid triggering recompilation
- * of other units when changes are made here.
- *
+ * \file CSysMatrix.inl
+ * \brief Inline subroutines of the <i>CSysMatrix.hpp</i> file.
+ * \note These are the "private" inlines, they are not needed outside
+ *       of the .cpp file and so they are hidden to avoid triggering
+ *       recompilation of other units when changes are made here.
  * \author F. Palacios, A. Bueno, T. Economon
  * \version 7.0.0 "Blackbird"
  *
- * The current SU2 release has been coordinated by the
- * SU2 International Developers Society <www.su2devsociety.org>
- * with selected contributions from the open-source community.
+ * SU2 Project Website: https://su2code.github.io
  *
- * The main research teams contributing to the current release are:
- *  - Prof. Juan J. Alonso's group at Stanford University.
- *  - Prof. Piero Colonna's group at Delft University of Technology.
- *  - Prof. Nicolas R. Gauger's group at Kaiserslautern University of Technology.
- *  - Prof. Alberto Guardone's group at Polytechnic University of Milan.
- *  - Prof. Rafael Palacios' group at Imperial College London.
- *  - Prof. Vincent Terrapon's group at the University of Liege.
- *  - Prof. Edwin van der Weide's group at the University of Twente.
- *  - Lab. of New Concepts in Aeronautics at Tech. Institute of Aeronautics.
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
  *
- * Copyright 2012-2019, Francisco D. Palacios, Thomas D. Economon,
- *                      Tim Albring, and the SU2 contributors.
+ * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
  *
  * SU2 is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -43,18 +32,25 @@
 
 #include "CSysMatrix.hpp"
 
+#if defined(_MSC_VER) /* Select the compiler-specific spelling used to force inlining. */
+  #define FORCEINLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+  #define FORCEINLINE inline __attribute__((always_inline))
+#else /* Unrecognized compiler: fall back to the plain inline keyword. */
+  #define FORCEINLINE inline
+#endif
+
 template<class ScalarType>
-inline ScalarType *CSysMatrix<ScalarType>::GetBlock_ILUMatrix(unsigned long block_i, unsigned long block_j) {
+FORCEINLINE ScalarType *CSysMatrix<ScalarType>::GetBlock_ILUMatrix(unsigned long block_i, unsigned long block_j) {
 
   for (unsigned long index = row_ptr_ilu[block_i]; index < row_ptr_ilu[block_i+1]; index++)
     if (col_ind_ilu[index] == block_j)
-      return &(ILU_matrix[index*nVar*nEqn]);
-
-  return NULL;
+      return &ILU_matrix[index*nVar*nEqn];
+  return nullptr;
 }
 
 template<class ScalarType>
-inline void CSysMatrix<ScalarType>::SetBlock_ILUMatrix(unsigned long block_i, unsigned long block_j, ScalarType *val_block) {
+FORCEINLINE void CSysMatrix<ScalarType>::SetBlock_ILUMatrix(unsigned long block_i, unsigned long block_j, ScalarType *val_block) {
 
   unsigned long iVar, index;
 
@@ -65,11 +61,10 @@ inline void CSysMatrix<ScalarType>::SetBlock_ILUMatrix(unsigned long block_i, un
       break;
     }
   }
-
 }
 
 template<class ScalarType>
-inline void CSysMatrix<ScalarType>::SetBlockTransposed_ILUMatrix(unsigned long block_i, unsigned long block_j, ScalarType *val_block) {
+FORCEINLINE void CSysMatrix<ScalarType>::SetBlockTransposed_ILUMatrix(unsigned long block_i, unsigned long block_j, ScalarType *val_block) {
 
   unsigned long iVar, jVar, index;
 
@@ -81,23 +76,10 @@ inline void CSysMatrix<ScalarType>::SetBlockTransposed_ILUMatrix(unsigned long b
       break;
     }
   }
-
-}
-
-template<class ScalarType>
-inline void CSysMatrix<ScalarType>::SubtractBlock_ILUMatrix(unsigned long block_i, unsigned long block_j, ScalarType *val_block) {
-
-  for (unsigned long index = row_ptr_ilu[block_i]; index < row_ptr_ilu[block_i+1]; index++) {
-    if (col_ind_ilu[index] == block_j) {
-      MatrixSubtraction(&ILU_matrix[index*nVar*nEqn], val_block, &ILU_matrix[index*nVar*nEqn]);
-      break;
-    }
-  }
-
 }
 
 template<class T, bool alpha, bool beta, bool transp>
-inline void gemv_impl(const unsigned long n, const T *a, const T *b, T *c) {
+FORCEINLINE void gemv_impl(const unsigned long n, const T *a, const T *b, T *c) {
   /*---
    This is a templated version of GEMV with the constants as boolean
    template parameters so that they can be optimized away at compilation.
@@ -114,7 +96,7 @@ inline void gemv_impl(const unsigned long n, const T *a, const T *b, T *c) {
 }
 
 template<class T>
-inline void gemm_impl(const unsigned long n, const T *a, const T *b, T *c) {
+FORCEINLINE void gemm_impl(const unsigned long n, const T *a, const T *b, T *c) {
   /*--- Same deal as for GEMV but here only the type is templated. ---*/
   unsigned long i, j, k;
   for (i = 0; i < n; i++) {
@@ -127,7 +109,7 @@ inline void gemm_impl(const unsigned long n, const T *a, const T *b, T *c) {
 }
 
 #define __MATVECPROD_SIGNATURE__(TYPE,NAME) \
-inline void CSysMatrix<TYPE>::NAME(const TYPE *matrix, const TYPE *vector, TYPE *product)
+FORCEINLINE void CSysMatrix<TYPE>::NAME(const TYPE *matrix, const TYPE *vector, TYPE *product) const
 
 #define MATVECPROD_SIGNATURE(NAME) template<class ScalarType> __MATVECPROD_SIGNATURE__(ScalarType,NAME)
 
@@ -153,7 +135,7 @@ MATVECPROD_SIGNATURE( MatrixVectorProductTransp ) {
 }
 
 template<class ScalarType>
-inline void CSysMatrix<ScalarType>::MatrixMatrixProduct(const ScalarType *matrix_a, const ScalarType *matrix_b, ScalarType *product) {
+FORCEINLINE void CSysMatrix<ScalarType>::MatrixMatrixProduct(const ScalarType *matrix_a, const ScalarType *matrix_b, ScalarType *product) const {
   gemm_impl<ScalarType>(nVar, matrix_a, matrix_b, product);
 }
 #else
@@ -179,7 +161,7 @@ MATVECPROD_SIGNATURE( MatrixVectorProductTransp ) {
 }
 
 template<class ScalarType>
-inline void CSysMatrix<ScalarType>::MatrixMatrixProduct(const ScalarType *matrix_a, const ScalarType *matrix_b, ScalarType *product) {
+FORCEINLINE void CSysMatrix<ScalarType>::MatrixMatrixProduct(const ScalarType *matrix_a, const ScalarType *matrix_b, ScalarType *product) const {
   MatrixMatrixProductKernel(MatrixMatrixProductJitter, const_cast<ScalarType*>(matrix_a),
                             const_cast<ScalarType*>(matrix_b), product );
 }
@@ -203,7 +185,7 @@ MATVECPROD_SPECIALIZATION( MatrixVectorProductTransp ) {
 }
 
 template<>
-inline void CSysMatrix<su2double>::MatrixMatrixProduct(const su2double *matrix_a, const su2double *matrix_b, su2double *product) {
+FORCEINLINE void CSysMatrix<su2double>::MatrixMatrixProduct(const su2double *matrix_a, const su2double *matrix_b, su2double *product) const {
   gemm_impl<su2double>(nVar, matrix_a, matrix_b, product);
 }
 #undef MATVECPROD_SPECIALIZATION
@@ -214,146 +196,66 @@ inline void CSysMatrix<su2double>::MatrixMatrixProduct(const su2double *matrix_a
 #undef __MATVECPROD_SIGNATURE__
 
 template<class ScalarType>
-inline void CSysMatrix<ScalarType>::Gauss_Elimination(ScalarType* matrix, ScalarType* vec) {
+FORCEINLINE void CSysMatrix<ScalarType>::Gauss_Elimination(unsigned long block_i, ScalarType* rhs, bool transposed) const {
 
-  /*---
-   This is a relatively large method to inline but maybe better
-   code will be generated for the special case nVar=1 this way.
-  ---*/
-
-  if (nVar==1) {vec[0] /= matrix[0]; return;}
-
-#ifdef USE_MKL_LAPACK
-  // With MKL_DIRECT_CALL enabled, this is significantly faster than native code on Intel Architectures.
-  LAPACKE_dgetrf( LAPACK_ROW_MAJOR, nVar, nVar, matrix, nVar, mkl_ipiv );
-  LAPACKE_dgetrs( LAPACK_ROW_MAJOR, 'N', nVar, 1, matrix, nVar, mkl_ipiv, vec, 1 );
-#else
-  int iVar, jVar, kVar, nvar = int(nVar);
-  ScalarType weight;
-
-  /*--- Transform system in Upper Matrix ---*/
-  for (iVar = 1; iVar < nvar; iVar++) {
-    for (jVar = 0; jVar < iVar; jVar++) {
-      weight = matrix[iVar*nvar+jVar] / matrix[jVar*nvar+jVar];
-      for (kVar = jVar; kVar < nvar; kVar++)
-        matrix[iVar*nvar+kVar] -= weight*matrix[jVar*nvar+kVar];
-      vec[iVar] -= weight*vec[jVar];
-    }
-  }
+  /*--- Copy block, as the algorithm modifies the matrix ---*/
+  ScalarType block[MAXNVAR*MAXNVAR];
+  MatrixCopy(&matrix[dia_ptr[block_i]*nVar*nVar], block, transposed);
 
-  /*--- Backwards substitution ---*/
-  for (iVar = nvar-1; iVar >= 0; iVar--) {
-    for (jVar = iVar+1; jVar < nvar; jVar++)
-      vec[iVar] -= matrix[iVar*nvar+jVar]*vec[jVar];
-    vec[iVar] /= matrix[iVar*nvar+iVar];
-  }
-#endif
+  Gauss_Elimination(block, rhs);
 }
 
 template<class ScalarType>
-inline void CSysMatrix<ScalarType>::MatrixInverse(const ScalarType *matrix, ScalarType *inverse) {
+FORCEINLINE void CSysMatrix<ScalarType>::InverseDiagonalBlock(unsigned long block_i, ScalarType *invBlock, bool transposed) const {
 
-  /*---
-   This is a generalization of Gaussian elimination for multiple rhs' (the basis vectors).
-   We could call "Gauss_Elimination" multiple times or fully generalize it for multiple rhs,
-   the performance of both routines would suffer in both cases without the use of exotic templating.
-   And so it feels reasonable to have some duplication here.
-  ---*/
-
-  if (nVar==1) {inverse[0] = 1.0/matrix[0]; return;}
-
-  int iVar, jVar, nvar = int(nVar);
+  /*--- Copy block, as the algorithm modifies the matrix ---*/
+  ScalarType block[MAXNVAR*MAXNVAR];
+  MatrixCopy(&matrix[dia_ptr[block_i]*nVar*nVar], block, transposed);
 
-  /*--- Initialize the inverse and make a copy of the matrix ---*/
-  for (iVar = 0; iVar < nvar; iVar++) {
-    for (jVar = 0; jVar < nvar; jVar++) {
-      block[iVar*nvar+jVar] = matrix[iVar*nvar+jVar];
-      inverse[iVar*nvar+jVar] = ScalarType(iVar==jVar); // identity
-    }
-  }
+  MatrixInverse(block, invBlock);
+}
 
-  /*--- Inversion ---*/
-#ifdef USE_MKL_LAPACK
-  // With MKL_DIRECT_CALL enabled, this is significantly faster than native code on Intel Architectures.
-  LAPACKE_dgetrf( LAPACK_ROW_MAJOR, nVar, nVar, block, nVar, mkl_ipiv );
-  LAPACKE_dgetrs( LAPACK_ROW_MAJOR, 'N', nVar, nVar, block, nVar, mkl_ipiv, inverse, nVar );
-#else
-  int kVar;
-  ScalarType weight;
+template<class ScalarType>
+FORCEINLINE void CSysMatrix<ScalarType>::InverseDiagonalBlock_ILUMatrix(unsigned long block_i, ScalarType *invBlock) const {
 
-  /*--- Transform system in Upper Matrix ---*/
-  for (iVar = 1; iVar < nvar; iVar++) {
-    for (jVar = 0; jVar < iVar; jVar++)
-    {
-      weight = block[iVar*nvar+jVar] / block[jVar*nvar+jVar];
+  /*--- Copy block, as the algorithm modifies the matrix ---*/
+  ScalarType block[MAXNVAR*MAXNVAR];
+  MatrixCopy(&ILU_matrix[dia_ptr_ilu[block_i]*nVar*nVar], block, false);
 
-      for (kVar = jVar; kVar < nvar; kVar++)
-        block[iVar*nvar+kVar] -= weight*block[jVar*nvar+kVar];
+  MatrixInverse(block, invBlock);
+}
 
-      /*--- at this stage "inverse" is lower triangular so not all cols need updating ---*/
-      for (kVar = 0; kVar <= jVar; kVar++)
-        inverse[iVar*nvar+kVar] -= weight*inverse[jVar*nvar+kVar];
-    }
-  }
+template<class ScalarType>
+FORCEINLINE void CSysMatrix<ScalarType>::UpperProduct(const CSysVector<ScalarType> & vec, unsigned long row_i,
+                                                      unsigned long col_ub, ScalarType *prod) const {
+  unsigned long iVar, index, col_j;
 
-  /*--- Backwards substitution ---*/
-  for (iVar = nvar-1; iVar >= 0; iVar--)
-  {
-    for (jVar = iVar+1; jVar < nvar; jVar++)
-      for (kVar = 0; kVar < nvar; kVar++)
-        inverse[iVar*nvar+kVar] -= block[iVar*nvar+jVar] * inverse[jVar*nvar+kVar];
+  for (iVar = 0; iVar < nVar; iVar++) prod[iVar] = 0.0;
 
-    for (kVar = 0; kVar < nvar; kVar++)
-      inverse[iVar*nvar+kVar] /= block[iVar*nvar+iVar];
+  for (index = dia_ptr[row_i]+1; index < row_ptr[row_i+1]; index++) {
+    col_j = col_ind[index];
+    if (col_j < col_ub)
+      MatrixVectorProductAdd(&matrix[index*nVar*nVar], &vec[col_j*nVar], prod);
   }
-#endif
 }
 
 template<class ScalarType>
-inline void CSysMatrix<ScalarType>::Gauss_Elimination(unsigned long block_i, ScalarType* rhs, bool transposed) {
+FORCEINLINE void CSysMatrix<ScalarType>::LowerProduct(const CSysVector<ScalarType> & vec, unsigned long row_i,
+                                                      unsigned long col_lb, ScalarType *prod) const {
+  unsigned long iVar, index, col_j;
 
-  unsigned long iVar, jVar;
-  ScalarType *Block = GetBlock(block_i, block_i);
-
-  /*--- Copy block, as the algorithm modifies the matrix ---*/
+  for (iVar = 0; iVar < nVar; iVar++) prod[iVar] = 0.0;
 
-  if (!transposed) {
-    // If source and dest overlap higher level problems occur, so memcpy is safe. And it is faster.
-    memcpy( block, Block, nVar*nVar*sizeof(ScalarType) );
-
-//    for (iVar = 0; iVar < nVar*nVar; iVar++)
-//       block[iVar] = Block[iVar];
-
-  } else {
-    for (iVar = 0; iVar < nVar; iVar++)
-      for (jVar = 0; jVar < nVar; jVar++)
-        block[iVar*nVar+jVar] = Block[jVar*nVar+iVar];
+  for (index = row_ptr[row_i]; index < dia_ptr[row_i]; index++) {
+    col_j = col_ind[index];
+    if (col_j >= col_lb)
+      MatrixVectorProductAdd(&matrix[index*nVar*nVar], &vec[col_j*nVar], prod);
   }
-
-  /*--- Solve system ---*/
-
-  Gauss_Elimination(block, rhs);
-
-}
-
-template<class ScalarType>
-inline void CSysMatrix<ScalarType>::InverseDiagonalBlock(unsigned long block_i, ScalarType *invBlock, bool transpose) {
-
-  const ScalarType* mat = GetBlock(block_i, block_i);
-  MatrixInverse(mat, invBlock);
-
-  if (transpose) // swap off-diag
-    for (unsigned long iVar = 0; iVar < nVar-1; ++iVar)
-      for (unsigned long jVar = iVar+1; jVar < nVar; ++jVar) {
-        ScalarType tmp = invBlock[iVar*nVar+jVar];
-        invBlock[iVar*nVar+jVar] = invBlock[jVar*nVar+iVar];
-        invBlock[jVar*nVar+iVar] = tmp;
-      }
 }
 
 template<class ScalarType>
-inline void CSysMatrix<ScalarType>::InverseDiagonalBlock_ILUMatrix(unsigned long block_i, ScalarType *invBlock) {
+FORCEINLINE void CSysMatrix<ScalarType>::DiagonalProduct(const CSysVector<ScalarType> & vec,
+                                                         unsigned long row_i, ScalarType *prod) const {
 
-  const ScalarType* mat = GetBlock_ILUMatrix(block_i, block_i);
-  MatrixInverse(mat, invBlock);
+  MatrixVectorProduct(&matrix[dia_ptr[row_i]*nVar*nVar], &vec[row_i*nVar], prod);
 }
diff --git a/Common/include/linear_algebra/CSysSolve.hpp b/Common/include/linear_algebra/CSysSolve.hpp
index 661972f05421..a2e8f1b67845 100644
--- a/Common/include/linear_algebra/CSysSolve.hpp
+++ b/Common/include/linear_algebra/CSysSolve.hpp
@@ -7,7 +7,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
@@ -31,8 +31,6 @@
 
 #include "../mpi_structure.hpp"
 
-#include <climits>
-#include <limits>
 #include <cmath>
 #include <vector>
 #include <iostream>
@@ -40,13 +38,13 @@
 #include <iomanip>
 #include <string>
 
-#include "../option_structure.hpp"
-#include "../config_structure.hpp"
-#include "../geometry/CGeometry.hpp"
 #include "CSysVector.hpp"
-#include "CSysMatrix.hpp"
-#include "CMatrixVectorProduct.hpp"
-#include "CPreconditioner.hpp"
+
+class CConfig;
+class CGeometry;
+template<class T> class CSysMatrix;
+template<class T> class CMatrixVectorProduct;
+template<class T> class CPreconditioner;
 
 using namespace std;
 
@@ -75,26 +73,26 @@ class CSysSolve {
   bool mesh_deform;    /*!< \brief Operate in mesh deformation mode, changes the source of solver options. */
   ScalarType Residual; /*!< \brief Residual at the end of a call to Solve. */
 
-  bool cg_ready;     /*!< \brief Indicate if memory used by CG is allocated. */
-  bool bcg_ready;    /*!< \brief Indicate if memory used by BCGSTAB is allocated. */
-  bool gmres_ready;  /*!< \brief Indicate if memory used by FGMRES is allocated. */
-  bool smooth_ready; /*!< \brief Indicate if memory used by SMOOTHER is allocated. */
+  mutable bool cg_ready;     /*!< \brief Indicate if memory used by CG is allocated. */
+  mutable bool bcg_ready;    /*!< \brief Indicate if memory used by BCGSTAB is allocated. */
+  mutable bool gmres_ready;  /*!< \brief Indicate if memory used by FGMRES is allocated. */
+  mutable bool smooth_ready; /*!< \brief Indicate if memory used by SMOOTHER is allocated. */
 
-  VectorType r;      /*!< \brief Residual in CG and BCGSTAB. */
-  VectorType A_x;    /*!< \brief Result of matrix-vector product in CG and BCGSTAB. */
-  VectorType p;      /*!< \brief Direction in CG and BCGSTAB. */
-  VectorType z;      /*!< \brief Preconditioned residual/direction in CG/BCGSTAB. */
+  mutable VectorType r;      /*!< \brief Residual in CG and BCGSTAB. */
+  mutable VectorType A_x;    /*!< \brief Result of matrix-vector product in CG and BCGSTAB. */
+  mutable VectorType p;      /*!< \brief Direction in CG and BCGSTAB. */
+  mutable VectorType z;      /*!< \brief Preconditioned residual/direction in CG/BCGSTAB. */
 
-  VectorType r_0;    /*!< \brief The "arbitrary" vector in BCGSTAB. */
-  VectorType v;      /*!< \brief BCGSTAB "v" vector (v = A * M^-1 * p). */
+  mutable VectorType r_0;    /*!< \brief The "arbitrary" vector in BCGSTAB. */
+  mutable VectorType v;      /*!< \brief BCGSTAB "v" vector (v = A * M^-1 * p). */
 
-  vector<VectorType> W;  /*!< \brief Large matrix used by FGMRES, w^i+1 = A * z^i. */
-  vector<VectorType> Z;  /*!< \brief Large matrix used by FGMRES, preconditioned W. */
+  mutable vector<VectorType> W;  /*!< \brief Large matrix used by FGMRES, w^i+1 = A * z^i. */
+  mutable vector<VectorType> Z;  /*!< \brief Large matrix used by FGMRES, preconditioned W. */
 
-  VectorType  LinSysRes_tmp;  /*!< \brief Temporary used when it is necessary to interface between active and passive types. */
-  VectorType  LinSysSol_tmp;  /*!< \brief Temporary used when it is necessary to interface between active and passive types. */
-  VectorType* LinSysRes_ptr;  /*!< \brief Pointer to appropriate LinSysRes (set to original or temporary in call to Solve). */
-  VectorType* LinSysSol_ptr;  /*!< \brief Pointer to appropriate LinSysSol (set to original or temporary in call to Solve). */
+  VectorType  LinSysSol_tmp;        /*!< \brief Temporary used when it is necessary to interface between active and passive types. */
+  VectorType  LinSysRes_tmp;        /*!< \brief Temporary used when it is necessary to interface between active and passive types. */
+  VectorType* LinSysSol_ptr;        /*!< \brief Pointer to appropriate LinSysSol (set to original or temporary in call to Solve). */
+  const VectorType* LinSysRes_ptr;  /*!< \brief Pointer to appropriate LinSysRes (set to original or temporary in call to Solve). */
 
   /*!
    * \brief sign transfer function
@@ -105,7 +103,7 @@ class CSysSolve {
    * so, feel free to delete this and replace it as needed with the
    * appropriate global function
    */
-  inline ScalarType Sign(const ScalarType & x, const ScalarType & y) const {
+  static inline ScalarType Sign(ScalarType x, ScalarType y) {
     if (y == 0.0) return 0.0;
     return fabs(x) * (y < 0.0 ? -1.0 : 1.0);
   }
@@ -117,7 +115,7 @@ class CSysSolve {
    * \param[in,out] h1 - first element of 2x1 vector being transformed
    * \param[in,out] h2 - second element of 2x1 vector being transformed
    */
-  void ApplyGivens(const ScalarType & s, const ScalarType & c, ScalarType & h1, ScalarType & h2);
+  void ApplyGivens(ScalarType s, ScalarType c, ScalarType & h1, ScalarType & h2) const;
 
   /*!
    * \brief generates the Givens rotation matrix for a given 2-vector
@@ -129,7 +127,7 @@ class CSysSolve {
    * Based on givens() of SPARSKIT, which is based on p.202 of
    * "Matrix Computations" by Golub and van Loan.
    */
-  void GenerateGivens(ScalarType & dx, ScalarType & dy, ScalarType & s, ScalarType & c);
+  void GenerateGivens(ScalarType & dx, ScalarType & dy, ScalarType & s, ScalarType & c) const;
 
   /*!
    * \brief finds the solution of the upper triangular system Hsbg*x = rhs
@@ -142,17 +140,16 @@ class CSysSolve {
    * \pre the upper Hessenberg matrix has been transformed into a
    * triangular matrix.
    */
-  void SolveReduced(const int & n, const vector<vector<ScalarType> > & Hsbg,
-                    const vector<ScalarType> & rhs, vector<ScalarType> & x);
+  void SolveReduced(int n, const vector<vector<ScalarType> > & Hsbg,
+                    const vector<ScalarType> & rhs, vector<ScalarType> & x) const;
 
   /*!
    * \brief Modified Gram-Schmidt orthogonalization
    * \author Based on Kesheng John Wu's mgsro subroutine in Saad's SPARSKIT
    *
-   * \tparam Vec - a generic vector class
    * \param[in] i - index indicating which vector in w is being orthogonalized
-   * \param[in, out] Hsbg - the upper Hessenberg begin updated
-   * \param[in, out] w - the (i+1)th vector of w is orthogonalized against the
+   * \param[in,out] Hsbg - the upper Hessenberg being updated
+   * \param[in,out] w - the (i+1)th vector of w is orthogonalized against the
    *                    previous vectors in w
    *
    * \pre the vectors w[0:i] are orthonormal
@@ -163,7 +160,7 @@ class CSysSolve {
    * vector is kept in nrm0 and updated after operating with each vector
    *
    */
-  void ModGramSchmidt(int i, vector<vector<ScalarType> > & Hsbg, vector<VectorType> & w);
+  void ModGramSchmidt(int i, vector<vector<ScalarType> > & Hsbg, vector<VectorType> & w) const;
 
   /*!
    * \brief writes header information for a CSysSolve residual history
@@ -173,24 +170,39 @@ class CSysSolve {
    *
    * \pre the ostream object os should be open
    */
-  void WriteHeader(const string & solver, const ScalarType & restol, const ScalarType & resinit);
+  void WriteHeader(string solver, ScalarType restol, ScalarType resinit) const;
 
   /*!
    * \brief writes residual convergence data for one iteration to a stream
    * \param[in] iter - current iteration
-   * \param[in] res - the (absolute) residual norm value
-   * \param[in] resinit - the initial residual norm
+   * \param[in] res - the residual norm to display
    *
    * \pre the ostream object os should be open
    */
-  void WriteHistory(const int & iter, const ScalarType & res, const ScalarType & resinit);
+  void WriteHistory(unsigned long iter, ScalarType res) const;
+
+  /*!
+   * \brief writes final residual convergence information
+   * \param[in] solver - string describing the solver
+   * \param[in] iter - current iteration
+   * \param[in] res - the residual norm
+   */
+  void WriteFinalResidual(string solver, unsigned long iter, ScalarType res) const;
+
+  /*!
+   * \brief writes the convergence warning
+   * \param[in] res_calc - the residual norm computed iteratively
+   * \param[in] res_true - the recomputed residual norm
+   * \param[in] tol - the residual norm
+   */
+  void WriteWarning(ScalarType res_calc, ScalarType res_true, ScalarType tol) const;
 
   /*!
    * \brief Used by Solve for compatibility between passive and active CSysVector, see specializations.
    * \param[in] LinSysRes - Linear system residual
    * \param[in,out] LinSysSol - Linear system solution
    */
-  void HandleTemporariesIn(CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol);
+  void HandleTemporariesIn(const CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol);
 
   /*!
    * \brief Used by Solve for compatibility between passive and active CSysVector, see specializations.
@@ -213,12 +225,13 @@ class CSysSolve {
    * \param[in] precond - object that defines preconditioner
    * \param[in] tol - tolerance with which to solve the system
    * \param[in] m - maximum size of the search subspace
+   * \param[out] residual - final normalized residual
    * \param[in] monitoring - turn on priting residuals from solver to screen.
    * \param[in] config - Definition of the particular problem.
    */
-  unsigned long CG_LinSolver(const VectorType & b, VectorType & x, ProductType & mat_vec,
-                             PrecondType & precond, ScalarType tol, unsigned long m,
-                             ScalarType *residual, bool monitoring, CConfig *config);
+  unsigned long CG_LinSolver(const VectorType & b, VectorType & x, const ProductType & mat_vec,
+                             const PrecondType & precond, ScalarType tol, unsigned long m,
+                             ScalarType & residual, bool monitoring, CConfig *config) const;
 
   /*!
    * \brief Flexible Generalized Minimal Residual method
@@ -228,13 +241,13 @@ class CSysSolve {
    * \param[in] precond - object that defines preconditioner
    * \param[in] tol - tolerance with which to solve the system
    * \param[in] m - maximum size of the search subspace
-   * \param[in] residual - norm of final residual
+   * \param[out] residual - final normalized residual
    * \param[in] monitoring - turn on priting residuals from solver to screen.
    * \param[in] config - Definition of the particular problem.
    */
-  unsigned long FGMRES_LinSolver(const VectorType & b, VectorType & x, ProductType & mat_vec,
-                                 PrecondType & precond, ScalarType tol, unsigned long m,
-                                 ScalarType *residual, bool monitoring, CConfig *config);
+  unsigned long FGMRES_LinSolver(const VectorType & b, VectorType & x, const ProductType & mat_vec,
+                                 const PrecondType & precond, ScalarType tol, unsigned long m,
+                                 ScalarType & residual, bool monitoring, CConfig *config) const;
 
   /*!
    * \brief Biconjugate Gradient Stabilized Method (BCGSTAB)
@@ -244,13 +257,13 @@ class CSysSolve {
    * \param[in] precond - object that defines preconditioner
    * \param[in] tol - tolerance with which to solve the system
    * \param[in] m - maximum size of the search subspace
-   * \param[in] residual - norm of final residual
+   * \param[out] residual - final normalized residual
    * \param[in] monitoring - turn on priting residuals from solver to screen.
    * \param[in] config - Definition of the particular problem.
    */
-  unsigned long BCGSTAB_LinSolver(const VectorType & b, VectorType & x, ProductType & mat_vec,
-                                  PrecondType & precond, ScalarType tol, unsigned long m,
-                                  ScalarType *residual, bool monitoring, CConfig *config);
+  unsigned long BCGSTAB_LinSolver(const VectorType & b, VectorType & x, const ProductType & mat_vec,
+                                  const PrecondType & precond, ScalarType tol, unsigned long m,
+                                  ScalarType & residual, bool monitoring, CConfig *config) const;
 
   /*!
    * \brief Generic smoother (modified Richardson iteration with preconditioner)
@@ -260,34 +273,34 @@ class CSysSolve {
    * \param[in] precond - object that defines preconditioner
    * \param[in] tol - tolerance with which to solve the system
    * \param[in] m - maximum number of iterations
-   * \param[in] residual - norm of final residual
+   * \param[out] residual - final normalized residual
    * \param[in] monitoring - turn on priting residuals from solver to screen.
    * \param[in] config - Definition of the particular problem.
    */
-  unsigned long Smoother_LinSolver(const VectorType & b, VectorType & x, ProductType & mat_vec,
-                                   PrecondType & precond, ScalarType tol, unsigned long m,
-                                   ScalarType *residual, bool monitoring, CConfig *config);
+  unsigned long Smoother_LinSolver(const VectorType & b, VectorType & x, const ProductType & mat_vec,
+                                   const PrecondType & precond, ScalarType tol, unsigned long m,
+                                   ScalarType & residual, bool monitoring, CConfig *config) const;
 
   /*!
    * \brief Solve the linear system using a Krylov subspace method
    * \param[in] Jacobian - Jacobian Matrix for the linear system
    * \param[in] LinSysRes - Linear system residual
-   * \param[in] LinSysSol - Linear system solution
+   * \param[in,out] LinSysSol - Linear system solution
    * \param[in] geometry -  Geometrical definition of the problem.
    * \param[in] config - Definition of the particular problem.
    */
-  unsigned long Solve(MatrixType & Jacobian, CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol,
+  unsigned long Solve(MatrixType & Jacobian, const CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol,
                       CGeometry *geometry, CConfig *config);
 
   /*!
    * \brief Solve the adjoint linear system using a Krylov subspace method
    * \param[in] Jacobian - Jacobian Matrix for the linear system
    * \param[in] LinSysRes - Linear system residual
-   * \param[in] LinSysSol - Linear system solution
+   * \param[in,out] LinSysSol - Linear system solution
    * \param[in] geometry -  Geometrical definition of the problem.
    * \param[in] config - Definition of the particular problem.
    */
-  unsigned long Solve_b(MatrixType & Jacobian, CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol,
+  unsigned long Solve_b(MatrixType & Jacobian, const CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol,
                         CGeometry *geometry, CConfig *config);
 
   /*!
diff --git a/Common/include/linear_algebra/CSysVector.hpp b/Common/include/linear_algebra/CSysVector.hpp
index d58566d3342d..cc7f73e9d709 100644
--- a/Common/include/linear_algebra/CSysVector.hpp
+++ b/Common/include/linear_algebra/CSysVector.hpp
@@ -1,13 +1,13 @@
 /*!
- * \file vector_structure.hpp
- * \brief Headers for the classes related to linear solvers (CG, FGMRES, etc)
- *        The subroutines and functions are in the <i>linear_solvers_structure.cpp</i> file.
+ * \file CSysVector.hpp
+ * \brief Declaration of the vector class used in the solution of
+ *        large, distributed, sparse linear systems.
  * \author F. Palacios, J. Hicken, T. Economon
  * \version 7.0.0 "Blackbird"
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
@@ -26,24 +26,11 @@
  * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
  */
 
-
 #pragma once
 
-#include "../mpi_structure.hpp"
-
-#include <climits>
 #include <cmath>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include <string>
 #include <cstdlib>
 
-using namespace std;
-
-/*--- Forward declaration of template friend functions. ---*/
-template<class T> class CSysVector;
-template<class T> T dotProd(const CSysVector<T> & u, const CSysVector<T> & v);
 
 /*!
  * \class CSysVector
@@ -52,22 +39,33 @@ template<class T> T dotProd(const CSysVector<T> & u, const CSysVector<T> & v);
  *
  * We could use the STL vector as a base class here, but this gives us
  * more flexibility with the underlying data (e.g. we may decide to
- * use a block storage scheme rather than a continuous storage
- * scheme).
+ * use a block storage scheme rather than a continuous storage scheme).
  */
 template<class ScalarType>
 class CSysVector {
 
 private:
-  unsigned long nElm;       /*!< \brief total number of elements (or number elements on this processor) */
-  unsigned long nElmDomain; /*!< \brief total number of elements (or number elements on this processor without Ghost cells) */
-#ifdef HAVE_MPI
-  unsigned long nElmGlobal; /*!< \brief total number of elements over all processors */
-#endif
-  unsigned short nVar;      /*!< \brief number of elements in a block */
-  unsigned long nBlk;       /*!< \brief number of blocks (or number of blocks on this processor) */
-  unsigned long nBlkDomain; /*!< \brief number of blocks (or number of blocks on this processor without Ghost cells) */
-  ScalarType* vec_val;      /*!< \brief storage for the element values */
+  enum { OMP_MAX_SIZE = 4096 };   /*!< \brief Maximum chunk size used in parallel for loops. */
+
+  unsigned long omp_chunk_size;   /*!< \brief Static chunk size used in loop, determined at initialization. */
+  ScalarType* vec_val;            /*!< \brief storage for the element values, 64 byte aligned (do not use normal new/delete) */
+  unsigned long nElm;             /*!< \brief total number of elements (or number elements on this processor) */
+  unsigned long nElmDomain;       /*!< \brief total number of elements (or number elements on this processor without Ghost cells) */
+  unsigned long nVar;             /*!< \brief number of elements in a block */
+  mutable ScalarType dotRes;      /*!< \brief result of dot product. to perform a reduction with OpenMP the
+                                              variable needs to be declared outside the parallel region */
+
+  /*!
+   * \brief Generic initialization from a scalar or array.
+   * \note If val==nullptr vec_val is not initialized, only allocated.
+   * \param[in] numBlk - number of blocks locally
+   * \param[in] numBlkDomain - number of blocks locally (without ghost cells)
+   * \param[in] numVar - number of variables in each block
+   * \param[in] val - default value for elements
+   * \param[in] valIsArray - if true val is treated as array
+   */
+  void Initialize(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar,
+                  const ScalarType* val, bool valIsArray);
 
 public:
 
@@ -81,29 +79,21 @@ class CSysVector {
    * \param[in] size - number of elements locally
    * \param[in] val - default value for elements
    */
-  CSysVector(const unsigned long & size, const ScalarType & val = 0.0);
+  CSysVector(unsigned long size, ScalarType val = 0.0) {
+    nElm = 0; vec_val = nullptr;
+    Initialize(size, size, 1, &val, false);
+  }
 
   /*!
    * \brief constructor of the class.
    * \param[in] numBlk - number of blocks locally
-   * \param[in] numBlkDomain
+   * \param[in] numBlkDomain - number of blocks locally (without ghost cells)
    * \param[in] numVar - number of variables in each block
    * \param[in] val - default value for elements
    */
-  CSysVector(const unsigned long & numBlk, const unsigned long & numBlkDomain, const unsigned short & numVar, const ScalarType & val = 0.0);
-
-  /*!
-   * \brief copy constructor of the class.
-   * \param[in] u - CSysVector that is being copied
-   */
-  CSysVector(const CSysVector & u);
-
-  /*!
-   * \brief Sets to zero all the entries of the vector.
-   */
-  inline void SetValZero(void) {
-    for (unsigned long i = 0; i < nElm; i++)
-      vec_val[i] = 0.0;
+  CSysVector(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar, ScalarType val = 0.0) {
+    nElm = 0; vec_val = nullptr;
+    Initialize(numBlk, numBlkDomain, numVar, &val, false);
   }
 
   /*!
@@ -111,7 +101,10 @@ class CSysVector {
    * \param[in] size - number of elements locally
    * \param[in] u_array - vector stored as array being copied
    */
-  explicit CSysVector(const unsigned long & size, const ScalarType* u_array);
+  explicit CSysVector(unsigned long size, const ScalarType* u_array) {
+    nElm = 0; vec_val = nullptr;
+    Initialize(size, size, 1, u_array, true);
+  }
 
   /*!
    * \brief constructor from array
@@ -120,22 +113,54 @@ class CSysVector {
    * \param[in] numVar - number of variables in each block
    * \param[in] u_array - vector stored as array being copied
    */
-  explicit CSysVector(const unsigned long & numBlk, const unsigned long & numBlkDomain, const unsigned short & numVar,
-                      const ScalarType* u_array);
+  explicit CSysVector(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar, const ScalarType* u_array) {
+    nElm = 0; vec_val = nullptr;
+    Initialize(numBlk, numBlkDomain, numVar, u_array, true);
+  }
+
+  /*!
+   * \brief copy constructor of the class.
+   * \param[in] u - CSysVector that is being copied
+   */
+  CSysVector(const CSysVector & u) {
+    nElm = 0; vec_val = nullptr;
+    Initialize(u.GetNBlk(), u.GetNBlkDomain(), u.nVar, u.vec_val, true);
+  }
+
+  /*!
+   * \brief Set our values (resizing if required) by copying from other, the derivative information is lost.
+   * \param[in] other - source CSysVector
+   */
+  template<class T>
+  void PassiveCopy(const CSysVector<T>& other);
 
   /*!
    * \brief class destructor
    */
-  virtual ~CSysVector();
+  ~CSysVector();
 
   /*!
-   * \brief Initialize the class.
+   * \brief Initialize the class with a scalar.
    * \param[in] numBlk - number of blocks locally
-   * \param[in] numBlkDomain
+   * \param[in] numBlkDomain - number of blocks locally (without ghost cells)
    * \param[in] numVar - number of variables in each block
    * \param[in] val - default value for elements
    */
-  void Initialize(const unsigned long & numBlk, const unsigned long & numBlkDomain, const unsigned short & numVar, const ScalarType & val = 0.0);
+  void Initialize(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar, ScalarType val = 0.0) {
+    Initialize(numBlk, numBlkDomain, numVar, &val, false);
+  }
+
+  /*!
+   * \brief Initialize the class with an array.
+   * \note If ptr==nullptr no copy occurs.
+   * \param[in] numBlk - number of blocks locally
+   * \param[in] numBlkDomain - number of blocks locally (without ghost cells)
+   * \param[in] numVar - number of variables in each block
+   * \param[in] ptr - pointer to data with which to initialize the vector
+   */
+  void Initialize(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar, const ScalarType* ptr) {
+    Initialize(numBlk, numBlkDomain, numVar, ptr, true);
+  }
 
   /*!
    * \brief return the number of local elements in the CSysVector
@@ -147,45 +172,34 @@ class CSysVector {
    */
   inline unsigned long GetNElmDomain() const { return nElmDomain; }
 
-  /*!
-   * \brief return the size of the CSysVector (over all processors)
-   */
-  inline unsigned long GetSize() const {
-#ifdef HAVE_MPI
-    return nElmGlobal;
-#else
-    return (unsigned long)nElm;
-#endif
-  }
-
   /*!
    * \brief return the number of variables at each block (typically number per node)
    */
-  inline unsigned short GetNVar() const { return nVar; }
+  inline unsigned long GetNVar() const { return nVar; }
 
   /*!
    * \brief return the number of blocks (typically number of nodes locally)
    */
-  inline unsigned long GetNBlk() const { return nBlk; }
+  inline unsigned long GetNBlk() const { return nElm/nVar; }
 
   /*!
    * \brief return the number of blocks (typically number of nodes locally)
    */
-  inline unsigned long GetNBlkDomain() const { return nBlkDomain; }
+  inline unsigned long GetNBlkDomain() const { return nElmDomain/nVar; }
 
   /*!
    * \brief set calling CSysVector to scaling of another CSysVector
    * \param[in] a - scalar factor for x
    * \param[in] x - CSysVector that is being scaled
    */
-  void Equals_AX(const ScalarType & a, CSysVector & x);
+  void Equals_AX(ScalarType a, const CSysVector & x);
 
   /*!
    * \brief adds a scaled CSysVector to calling CSysVector
    * \param[in] a - scalar factor for x
    * \param[in] x - CSysVector that is being scaled
    */
-  void Plus_AX(const ScalarType & a, CSysVector & x);
+  void Plus_AX(ScalarType a, const CSysVector & x);
 
   /*!
    * \brief general linear combination of two CSysVectors
@@ -194,7 +208,7 @@ class CSysVector {
    * \param[in] b - scalar factor for y
    * \param[in] y - second CSysVector in linear combination
    */
-  void Equals_AX_Plus_BY(const ScalarType & a, CSysVector & x, const ScalarType & b, CSysVector & y);
+  void Equals_AX_Plus_BY(ScalarType a, const CSysVector & x, ScalarType b, const CSysVector & y);
 
   /*!
    * \brief assignment operator with deep copy
@@ -206,13 +220,12 @@ class CSysVector {
    * \brief CSysVector=su2double assignment operator
    * \param[in] val - value assigned to each element of CSysVector
    */
-  CSysVector & operator=(const ScalarType & val);
+  CSysVector & operator=(ScalarType val);
 
   /*!
-   * \brief addition operator
-   * \param[in] u - CSysVector being added to *this
+   * \brief Sets to zero all the entries of the vector.
    */
-  CSysVector operator+(const CSysVector & u) const;
+  inline void SetValZero(void) { *this = ScalarType(0.0); }
 
   /*!
    * \brief compound addition-assignment operator
@@ -220,12 +233,6 @@ class CSysVector {
    */
   CSysVector & operator+=(const CSysVector & u);
 
-  /*!
-   * \brief subtraction operator
-   * \param[in] u - CSysVector being subtracted from *this
-   */
-  CSysVector operator-(const CSysVector & u) const;
-
   /*!
    * \brief compound subtraction-assignment operator
    * \param[in] u - CSysVector being subtracted from calling object
@@ -233,28 +240,35 @@ class CSysVector {
   CSysVector & operator-=(const CSysVector & u);
 
   /*!
-   * \brief vector * scalar multiplication operator
-   * \param[in] val - value to multiply *this by
+   * \brief compound scalar multiplication-assignment operator
+   * \param[in] val - value to multiply calling object by
+   */
+  CSysVector & operator*=(ScalarType val);
+
+  /*!
+   * \brief compound scalar division-assignment operator
+   * \param[in] val - value to divide elements of calling object by
    */
-  CSysVector operator*(const ScalarType & val) const;
+  CSysVector & operator/=(ScalarType val);
 
   /*!
-   * \brief compound scalar multiplication-assignment operator
-   * \param[in] val - value to multiply calling object by
+   * \brief Dot product between "this" and another vector
+   * \param[in] u - Another vector.
+   * \return result of dot product
    */
-  CSysVector & operator*=(const ScalarType & val);
+  ScalarType dot(const CSysVector & u) const;
 
   /*!
-   * \brief vector-scalar division operator (no scalar/vector operator)
-   * \param[in] val - value to divide elements of *this by
+   * \brief squared L2 norm of the vector (via dot with self)
+   * \return squared L2 norm
    */
-  CSysVector operator/(const ScalarType & val) const;
+  inline ScalarType squaredNorm() const { return dot(*this); }
 
   /*!
-   * \brief compound scalar division-assignment operator
-   * \param[in] val - value to divide elements of calling object by
+   * \brief L2 norm of the vector
+   * \return L2 norm
    */
-  CSysVector & operator/=(const ScalarType & val);
+  inline ScalarType norm() const { return sqrt(squaredNorm()); }
 
   /*!
    * \brief indexing operator with assignment permitted
@@ -268,32 +282,32 @@ class CSysVector {
    */
   inline const ScalarType & operator[](const unsigned long & i) const { return vec_val[i]; }
 
-  /*!
-   * \brief the L2 norm of the CSysVector
-   * \result the L2 norm
-   */
-  ScalarType norm() const;
-
   /*!
    * \brief copies the contents of the calling CSysVector into an array
    * \param[out] u_array - array into which information is being copied
    * \pre u_array must be allocated and have the same size as CSysVector
    */
-  void CopyToArray(ScalarType* u_array);
+  void CopyToArray(ScalarType* u_array) const;
 
   /*!
    * \brief Subtract val_residual to the residual.
    * \param[in] val_ipoint - index of the point where subtract the residual.
    * \param[in] val_residual - Value to subtract to the residual.
    */
-  void SubtractBlock(unsigned long val_ipoint, ScalarType *val_residual);
+  inline void SubtractBlock(unsigned long val_ipoint, const ScalarType *val_residual) {
+    for (auto iVar = 0ul; iVar < nVar; iVar++)
+      vec_val[val_ipoint*nVar+iVar] -= val_residual[iVar];
+  }
 
   /*!
    * \brief Add val_residual to the residual.
    * \param[in] val_ipoint - index of the point where add the residual.
    * \param[in] val_residual - Value to add to the residual.
    */
-  void AddBlock(unsigned long val_ipoint, ScalarType *val_residual);
+  inline void AddBlock(unsigned long val_ipoint, const ScalarType *val_residual) {
+    for (auto iVar = 0ul; iVar < nVar; iVar++)
+      vec_val[val_ipoint*nVar+iVar] += val_residual[iVar];
+  }
 
   /*!
    * \brief Set val_residual to the residual.
@@ -301,34 +315,44 @@ class CSysVector {
    * \param[in] val_var - inde of the residual to be set.
    * \param[in] val_residual - Value to set to the residual.
    */
-  void SetBlock(unsigned long val_ipoint, unsigned short val_var, ScalarType val_residual);
+  inline void SetBlock(unsigned long val_ipoint, unsigned long val_var, ScalarType val_residual) {
+    vec_val[val_ipoint*nVar+val_var] = val_residual;
+  }
 
   /*!
    * \brief Set val_residual to the residual.
    * \param[in] val_ipoint - index of the point where set the residual.
    * \param[in] val_residual - Value to set to the residual.
    */
-  void SetBlock(unsigned long val_ipoint, ScalarType *val_residual);
+  inline void SetBlock(unsigned long val_ipoint, const ScalarType *val_residual) {
+    for (auto iVar = 0ul; iVar < nVar; iVar++)
+      vec_val[val_ipoint*nVar+iVar] = val_residual[iVar];
+  }
 
   /*!
    * \brief Set the residual to zero.
    * \param[in] val_ipoint - index of the point where set the residual.
    */
-  void SetBlock_Zero(unsigned long val_ipoint);
+  inline void SetBlock_Zero(unsigned long val_ipoint) {
+    for (auto iVar = 0ul; iVar < nVar; iVar++)
+      vec_val[val_ipoint*nVar+iVar] = 0.0;
+  }
 
   /*!
    * \brief Set the velocity residual to zero.
    * \param[in] val_ipoint - index of the point where set the residual.
    * \param[in] val_var - inde of the residual to be set.
    */
-  void SetBlock_Zero(unsigned long val_ipoint, unsigned short val_var);
+  inline void SetBlock_Zero(unsigned long val_ipoint, unsigned long val_var) {
+    vec_val[val_ipoint*nVar+val_var] = 0.0;
+  }
 
   /*!
    * \brief Get the value of the residual.
    * \param[in] val_ipoint - index of the point where set the residual.
    * \return Pointer to the residual.
    */
-  ScalarType *GetBlock(unsigned long val_ipoint);
+  inline ScalarType *GetBlock(unsigned long val_ipoint) { return &vec_val[val_ipoint*nVar]; }
 
   /*!
    * \brief Get the value of the residual.
@@ -336,27 +360,7 @@ class CSysVector {
    * \param[in] val_var - inde of the residual to be set.
    * \return Value of the residual.
    */
-  ScalarType GetBlock(unsigned long val_ipoint, unsigned short val_var);
-
-  /*!
-   * \brief dot-product between two CSysVectors
-   * \param[in] u - first CSysVector in dot product
-   * \param[in] v - second CSysVector in dot product
-   */
-  friend ScalarType dotProd<ScalarType>(const CSysVector & u, const CSysVector & v);
-
-  /*!
-   * \brief Set our values (resizing if required) by copying from other, the derivative information is lost.
-   * \param[in] other - source CSysVector
-   */
-  template<class T>
-  void PassiveCopy(const CSysVector<T>& other);
+  inline ScalarType GetBlock(unsigned long val_ipoint, unsigned long val_var) const {
+    return vec_val[val_ipoint*nVar+val_var];
+  }
 };
-
-/*!
- * \brief scalar * vector multiplication operator
- * \param[in] val - scalar value to multiply by
- * \param[in] u - CSysVector having its elements scaled
- */
-template<class ScalarType>
-CSysVector<ScalarType> operator*(const ScalarType & val, const CSysVector<ScalarType> & u);
diff --git a/Common/include/mpi_structure.hpp b/Common/include/mpi_structure.hpp
index e242a8f9ff91..20f2113ecfca 100644
--- a/Common/include/mpi_structure.hpp
+++ b/Common/include/mpi_structure.hpp
@@ -118,7 +118,9 @@ class CBaseMPIWrapper {
   static void Error(std::string ErrorMsg, std::string FunctionName);
 
   static void Init(int *argc, char***argv);
-  
+
+  static void Init_thread(int *argc, char***argv, int required, int* provided);
+
   static void Buffer_attach(void *buffer, int size);
 
   static void Buffer_detach(void *buffer, int *size);
@@ -226,6 +228,8 @@ class CMediMPIWrapper: public CBaseMPIWrapper {
   
   static void Init(int *argc, char***argv);
 
+  static void Init_thread(int *argc, char***argv, int required, int* provided);
+
   static void Init_AMPI(void);
 
   static void Buffer_attach(void *buffer, int size);
@@ -356,6 +360,8 @@ class CBaseMPIWrapper {
   static void Error(std::string ErrorMsg, std::string FunctionName);
     
   static void Init(int *argc, char***argv);
+
+  static void Init_thread(int *argc, char***argv, int required, int* provided);
   
   static void Buffer_attach(void *buffer, int size);
   
diff --git a/Common/include/mpi_structure.inl b/Common/include/mpi_structure.inl
index 9ff12d749052..eab1cbe2ab50 100644
--- a/Common/include/mpi_structure.inl
+++ b/Common/include/mpi_structure.inl
@@ -131,6 +131,17 @@ inline void CBaseMPIWrapper::Init(int *argc, char ***argv) {
   winMinRankErrorInUse = true;
 }
 
+inline void CBaseMPIWrapper::Init_thread(int *argc, char ***argv, int required, int* provided) {
+  MPI_Init_thread(argc,argv,required,provided);
+  MPI_Comm_rank(currentComm, &Rank);    
+  MPI_Comm_size(currentComm, &Size);  
+
+  MinRankError = Size;
+  MPI_Win_create(&MinRankError, sizeof(int), sizeof(int), MPI_INFO_NULL,
+                 currentComm, &winMinRankError);
+  winMinRankErrorInUse = true;
+}
+
 inline void CBaseMPIWrapper::Buffer_attach(void *buffer, int size){
   MPI_Buffer_attach(buffer, size);
 }
@@ -266,7 +277,7 @@ inline void CBaseMPIWrapper::Waitany(int nrequests, Request *request,
                                  int *index, Status *status) {
   MPI_Waitany(nrequests, request, index, status);
 }
-  
+
 
 #if defined CODI_REVERSE_TYPE || defined CODI_FORWARD_TYPE
 
@@ -282,6 +293,18 @@ inline void CMediMPIWrapper::Init(int *argc, char ***argv) {
   winMinRankErrorInUse = true;
 }
 
+inline void CMediMPIWrapper::Init_thread(int *argc, char ***argv, int required, int* provided) {
+  AMPI_Init_thread(argc,argv,required,provided);
+  MediTool::init();
+  AMPI_Comm_rank(convertComm(currentComm), &Rank);    
+  AMPI_Comm_size(convertComm(currentComm), &Size);  
+
+  MinRankError = Size;
+  MPI_Win_create(&MinRankError, sizeof(int), sizeof(int), MPI_INFO_NULL,
+                 currentComm, &winMinRankError);
+  winMinRankErrorInUse = true;
+}
+
 inline void CMediMPIWrapper::Init_AMPI(void) {
   AMPI_Init_common();
   MediTool::init();
@@ -517,6 +540,8 @@ inline CBaseMPIWrapper::Comm CBaseMPIWrapper::GetComm(){
 
 inline void CBaseMPIWrapper::Init(int *argc, char ***argv) {}
 
+inline void CBaseMPIWrapper::Init_thread(int *argc, char***argv, int required, int* provided) {*provided = required;}
+
 inline void CBaseMPIWrapper::Buffer_attach(void *buffer, int size) {}
 
 inline void CBaseMPIWrapper::Buffer_detach(void *buffer, int *size) {}
diff --git a/Common/include/omp_structure.hpp b/Common/include/omp_structure.hpp
new file mode 100644
index 000000000000..25b8e49d2a02
--- /dev/null
+++ b/Common/include/omp_structure.hpp
@@ -0,0 +1,113 @@
+/*!
+ * \file omp_structure.hpp
+ * \brief OpenMP interface header, provides compatibility functions
+ *        if the code is built without OpenMP support.
+ *        Parallel pragmas are defined here so that they can be
+ *        completely "disabled" when compiling without OpenMP.
+ * \note Do not include omp.h explicitly anywhere, use this header instead.
+ * \note If you use an omp_*** function define a compatibility version here,
+ *       if that is not practical use define "HAVE_OMP" to guard that function.
+ * \note Always use the macro "SU2_OMP" to create OpenMP constructs, this is so
+ *       we can disable pragmas. Other convenient pragmas are also defined here
+ *       e.g. SU2_OMP_PARALLEL. Exotic pragmas of limited portability should be
+ *       defined here with suitable fallback versions to limit the spread of
+ *       compiler tricks in other areas of the code.
+ * \author P. Gomes
+ * \version 7.0.0 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#if defined(_MSC_VER)
+#define PRAGMIZE(X) __pragma(X)
+#else
+#define PRAGMIZE(X) _Pragma(#X)
+#endif
+
+/*--- Detect compilation with OpenMP support, protect against
+ *    using OpenMP with AD (not supported yet). ---*/
+#if defined(_OPENMP) && !defined(CODI_REVERSE_TYPE) && !defined(CODI_FORWARD_TYPE)
+#define HAVE_OMP
+#include <omp.h>
+
+/*--- The generic start of OpenMP constructs. ---*/
+#define SU2_OMP(ARGS) PRAGMIZE(omp ARGS)
+
+#else // Compile without OpenMP
+
+/*--- Disable pragmas to quiet compilation warnings. ---*/
+#define SU2_OMP(ARGS)
+
+/*!
+ * \brief Maximum number of threads available.
+ */
+inline constexpr int omp_get_max_threads(void) {return 1;}
+
+/*!
+ * \brief Index of current thread, akin to MPI rank.
+ */
+inline constexpr int omp_get_thread_num(void) {return 0;}
+
+#endif
+
+/*--- Convenience macros (do not use excessive nesting of macros). ---*/
+#define SU2_OMP_SIMD SU2_OMP(simd)
+
+#define SU2_OMP_MASTER SU2_OMP(master)
+#define SU2_OMP_BARRIER SU2_OMP(barrier)
+
+#define SU2_OMP_PARALLEL SU2_OMP(parallel)
+#define SU2_OMP_PARALLEL_(ARGS) SU2_OMP(parallel ARGS)
+#define SU2_OMP_PARALLEL_ON(NTHREADS) SU2_OMP(parallel num_threads(NTHREADS))
+
+#define SU2_OMP_FOR_DYN(CHUNK) SU2_OMP(for schedule(dynamic,CHUNK))
+#define SU2_OMP_FOR_STAT(CHUNK) SU2_OMP(for schedule(static,CHUNK))
+
+
+/*--- Convenience functions (e.g. to compute chunk sizes). ---*/
+
+/*!
+ * \brief Integer division rounding up (the denominator must not be zero).
+ */
+inline constexpr size_t roundUpDiv(size_t numerator, size_t denominator)
+{
+  return (numerator+denominator-1)/denominator;
+}
+
+/*!
+ * \brief Compute a chunk size based on totalWork and number of threads such that
+ *        all threads get the same number of chunks (with limited size).
+ * \param[in] totalWork - e.g. total number of loop iterations.
+ * \param[in] numThreads - Number of threads that will share the work.
+ * \param[in] maxChunkSize - Upper bound for chunk size.
+ * \return The chunkSize, or maxChunkSize if there is no work to share.
+ */
+inline size_t computeStaticChunkSize(size_t totalWork, size_t numThreads,
+                                     size_t maxChunkSize)
+{
+  /*--- Without work the number of chunks would be 0, do not divide by it. ---*/
+  if(totalWork == 0) return maxChunkSize;
+  const size_t workPerThread = roundUpDiv(totalWork, numThreads);
+  return roundUpDiv(workPerThread, roundUpDiv(workPerThread, maxChunkSize));
+}
+
diff --git a/Common/include/toolboxes/C2DContainer.hpp b/Common/include/toolboxes/C2DContainer.hpp
index 028ff7469af7..508993a48f6d 100644
--- a/Common/include/toolboxes/C2DContainer.hpp
+++ b/Common/include/toolboxes/C2DContainer.hpp
@@ -31,6 +31,7 @@
 #include "../datatype_structure.hpp"
 
 #include <utility>
+#include <type_traits>
 
 /*!
  * \enum StorageType
@@ -360,6 +361,8 @@ template<typename Index_t, class Scalar_t, StorageType Store, size_t AlignSize,
 class C2DContainer :
   public container_helpers::AccessorImpl<Index_t,Scalar_t,Store,AlignSize,StaticRows,StaticCols>
 {
+  static_assert(std::is_integral<Index_t>::value,"");
+
 private:
   using Base = container_helpers::AccessorImpl<Index_t,Scalar_t,Store,AlignSize,StaticRows,StaticCols>;
   using Base::m_data;
@@ -401,12 +404,9 @@ class C2DContainer :
       free(m_data);
     }
 
-    /*--- round up size to a multiple of the alignment specification if necessary ---*/
-    size_t bytes = reqSize*sizeof(Scalar_t);
-    size_t allocSize = (AlignSize==0)? bytes : ((bytes+AlignSize-1)/AlignSize)*AlignSize;
-
     /*--- request actual allocation to base class as it needs specialization ---*/
-    m_allocate(allocSize,rows,cols);
+    size_t bytes = reqSize*sizeof(Scalar_t);
+    m_allocate(bytes,rows,cols);
 
     return reqSize;
   }
diff --git a/Common/include/toolboxes/allocation_toolbox.hpp b/Common/include/toolboxes/allocation_toolbox.hpp
index fe334f253372..2a089b323bc4 100644
--- a/Common/include/toolboxes/allocation_toolbox.hpp
+++ b/Common/include/toolboxes/allocation_toolbox.hpp
@@ -9,7 +9,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
@@ -33,7 +33,7 @@
 #if defined(_WIN32)
 #include <malloc.h>
 #else
-#include <stdlib.h> 
+#include <stdlib.h>
 #endif
 
 #include <cassert>
@@ -46,6 +46,11 @@ inline constexpr bool is_power_of_two(size_t x)
   return x && !(x & (x-1));
 }
 
+/*! \brief Round x up to the nearest multiple of "multiple" (which must not be zero). */
+inline constexpr size_t round_up(size_t multiple, size_t x) {
+  return (x + (multiple - 1)) / multiple * multiple;
+}
+
 /*!
  * \brief Aligned memory allocation compatible across platforms.
  * \param[in] alignment, in bytes, of the memory being allocated.
@@ -59,6 +64,8 @@ inline T* aligned_alloc(size_t alignment, size_t size) noexcept
 
   if(alignment < alignof(void*)) alignment = alignof(void*);
 
+  size = round_up(alignment, size);
+
   void* ptr = nullptr;
 
 #if defined(__APPLE__)
diff --git a/Common/include/toolboxes/graph_toolbox.hpp b/Common/include/toolboxes/graph_toolbox.hpp
new file mode 100644
index 000000000000..b2bbdbd74da5
--- /dev/null
+++ b/Common/include/toolboxes/graph_toolbox.hpp
@@ -0,0 +1,489 @@
+/*!
+ * \file graph_toolbox.hpp
+ * \brief Functions and classes to build/represent sparse graphs or sparse patterns.
+ * \author P. Gomes
+ * \version 7.0.0 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "C2DContainer.hpp"
+
+#include <set>
+#include <vector>
+#include <limits>
+#include <cassert>
+#include <algorithm>
+
+/*!
+ * \enum ConnectivityType
+ * \brief In FVM points are connected by the edges (faces) of the grid.
+ *        In FEM, two points are connected if they have an element in common.
+ */
+enum class ConnectivityType {FiniteVolume=0, FiniteElement=1};
+
+
+/*!
+ * \class CCompressedSparsePattern
+ * \brief A simple class to store adjacency information in a
+ * compressed format suitable for sparse matrix operations.
+ * If built for row-major storage the inner indices are column indices
+ * and the pattern should be used as (row,icol), otherwise as (col,irow).
+ */
+template<typename Index_t>
+class CCompressedSparsePattern {
+  static_assert(std::is_integral<Index_t>::value,"");
+
+private:
+  su2vector<Index_t> m_outerPtr; /*!< \brief Start positions of the inner indices for each outer index. */
+  su2vector<Index_t> m_innerIdx; /*!< \brief Inner indices of the non zero entries. */
+  su2vector<Index_t> m_diagPtr;  /*!< \brief Position of the diagonal entry. */
+
+public:
+  using IndexType = Index_t;
+
+  CCompressedSparsePattern() = default;
+
+  /*!
+   * \brief Construct from rvalue refs, the arguments are moved into the pattern.
+   * \note This is the most efficient constructor as no data copy occurs.
+   * \param[in] outerPtr - Outer index pointers.
+   * \param[in] innerIdx - Inner indices.
+   */
+  CCompressedSparsePattern(su2vector<Index_t>&& outerPtr,
+                           su2vector<Index_t>&& innerIdx) :
+    m_outerPtr(std::move(outerPtr)), m_innerIdx(std::move(innerIdx))
+  {
+    /*--- Sanity check, the last outer pointer must be the number of inner indices. ---*/
+    assert(m_innerIdx.size() == m_outerPtr(m_outerPtr.size()-1));
+  }
+
+  /*!
+   * \brief Construct from vector-like objects of any type with
+   *        methods "size()" and "data()" (returning a pointer).
+   * \param[in] outerPtr - Outer index pointers.
+   * \param[in] innerIdx - Inner indices.
+   */
+  template<class T>
+  CCompressedSparsePattern(const T& outerPtr, const T& innerIdx)
+  {
+    m_outerPtr.resize(outerPtr.size());
+    for(Index_t i=0; i<outerPtr.size(); ++i)
+      m_outerPtr(i) = outerPtr.data()[i];
+
+    m_innerIdx.resize(innerIdx.size());
+    for(Index_t i=0; i<innerIdx.size(); ++i)
+      m_innerIdx(i) = innerIdx.data()[i];
+
+    /*--- perform a basic sanity check ---*/
+    assert(m_innerIdx.size() == m_outerPtr(m_outerPtr.size()-1));
+  }
+
+  /*!
+   * \brief Build a list of pointers to the diagonal entries of the pattern.
+   */
+  void buildDiagPtr() {
+    if(!m_diagPtr.empty()) return;
+
+    m_diagPtr.resize(getOuterSize());
+    for(Index_t k = 0; k < getOuterSize(); ++k)
+      m_diagPtr(k) = findInnerIdx(k,k);
+  }
+
+  /*!
+   * \return True if the pattern is empty, i.e. has not been built yet.
+   */
+  inline bool empty() const {
+    return m_outerPtr.empty() || m_innerIdx.empty();
+  }
+
+  /*!
+   * \return Number of rows/columns.
+   */
+  inline Index_t getOuterSize() const {
+    return m_outerPtr.size()-1;
+  }
+
+  /*!
+   * \return Number of non zero entries.
+   */
+  inline Index_t getNumNonZeros() const {
+    return m_innerIdx.size();
+  }
+
+  /*!
+   * \param[in] iOuterIdx - Outer index.
+   * \return Number of inner indices associated with the outer index.
+   */
+  inline Index_t getNumNonZeros(Index_t iOuterIdx) const {
+    return m_outerPtr(iOuterIdx+1) - m_outerPtr(iOuterIdx);
+  }
+
+  /*!
+   * \param[in] iOuterIdx - Outer index.
+   * \param[in] iNonZero - Relative position of the inner index.
+   * \return The index of the i'th inner index associated with the outer index.
+   */
+  inline Index_t getInnerIdx(Index_t iOuterIdx, Index_t iNonZero) const {
+    assert(iNonZero >= 0 && iNonZero < getNumNonZeros(iOuterIdx));
+    return m_innerIdx(m_outerPtr(iOuterIdx) + iNonZero);
+  }
+
+  /*!
+   * \param[in] iOuterIdx - Outer index (row/col).
+   * \param[in] iInnerIdx - Inner index (col/row).
+   * \return Absolute position of non zero entry (iOuterIdx,iInnerIdx),
+   *         or NNZ if position does not belong to the pattern.
+   */
+  inline Index_t findInnerIdx(Index_t iOuterIdx, Index_t iInnerIdx) const {
+    for(Index_t k = m_outerPtr(iOuterIdx); k < m_outerPtr(iOuterIdx+1); ++k)
+      if(m_innerIdx(k) == iInnerIdx) return k;
+    return m_innerIdx.size();
+  }
+
+  /*!
+   * \param[in] iOuterIdx - Outer index (row/col).
+   * \param[in] iInnerIdx - Inner index (col/row).
+   * \return True if (iOuterIdx,iInnerIdx) exists, i.e. is non zero.
+   */
+  inline bool isNonZero(Index_t iOuterIdx, Index_t iInnerIdx) const {
+    return findInnerIdx(iOuterIdx, iInnerIdx) < m_innerIdx.size();
+  }
+
+  /*!
+   * \param[in] iOuterIdx - Outer index (row/col).
+   * \param[in] iInnerIdx - Inner index (col/row).
+   * \return Absolute position of non zero entry (iOuterIdx,iInnerIdx).
+   * \note This method is only safe if the entry exists.
+   */
+  inline Index_t quickFindInnerIdx(Index_t iOuterIdx, Index_t iInnerIdx) const {
+    assert(isNonZero(iOuterIdx, iInnerIdx) && "Error, j does not belong to NZ(i).");
+    Index_t k = m_outerPtr(iOuterIdx);
+    while(m_innerIdx(k) != iInnerIdx) ++k;
+    return k;
+  }
+
+  /*!
+   * \param[in] iDiagIdx - Diagonal index (row == col).
+   * \return Absolute position of the diagonal entry.
+   */
+  inline Index_t getDiagPtr(Index_t iDiagIdx) const {
+    return m_diagPtr(iDiagIdx);
+  }
+
+  /*!
+   * \return Raw pointer to the outer pointer vector.
+   */
+  inline const Index_t* outerPtr() const {
+    assert(!empty() && "Sparse pattern has not been built.");
+    return m_outerPtr.data();
+  }
+
+  /*!
+   * \return Raw pointer to the inner index vector.
+   */
+  inline const Index_t* innerIdx() const {
+    assert(!empty() && "Sparse pattern has not been built.");
+    return m_innerIdx.data();
+  }
+
+  /*!
+   * \return Raw pointer to the diagonal pointer vector.
+   */
+  inline const Index_t* diagPtr() const {
+    assert(!m_diagPtr.empty() && "Diagonal map has not been built.");
+    return m_diagPtr.data();
+  }
+
+  /*!
+   * \return The minimum inner index.
+   */
+  Index_t getMinInnerIdx() const {
+    Index_t idx = std::numeric_limits<Index_t>::max();
+    for(Index_t k=0; k<m_innerIdx.size(); ++k)
+      idx = std::min(idx, m_innerIdx(k));
+    return idx;
+  }
+
+  /*!
+   * \return The maximum inner index.
+   */
+  Index_t getMaxInnerIdx() const {
+    Index_t idx = std::numeric_limits<Index_t>::min();
+    for(Index_t k=0; k<m_innerIdx.size(); ++k)
+      idx = std::max(idx, m_innerIdx(k));
+    return idx;
+  }
+};
+
+/*!
+ * \brief Alias a type of container as the edge map class, this is a N by 2 container
+ *        that maps the two sparse entries referenced by an edge (ij and ji) to two
+ *        non zero entries of a sparse pattern.
+ */
+template<typename Index_t>
+using CEdgeToNonZeroMap = C2DContainer<unsigned long, Index_t, StorageType::RowMajor, 64, DynamicSize, 2>;
+
+
+using CCompressedSparsePatternUL = CCompressedSparsePattern<unsigned long>;
+using CEdgeToNonZeroMapUL = CEdgeToNonZeroMap<unsigned long>;
+
+
+/*!
+ * \brief Build a sparse pattern from geometry information, of type FVM or FEM,
+ *        for a given fill-level. At fill-level N, the immediate neighbors of the
+ *        points in level N-1 are also considered neighbors of the base point.
+ *        The resulting pattern is that of A^{N+1} where A is the sparse matrix
+ *        of immediate neighbors.
+ * \note Algorithm is equivalent to the implementation by F. Palacios,
+ *       A. Bueno, and T. Economon from CSysMatrix.
+ * \param[in] geometry - Definition of the grid.
+ * \param[in] type - Of connectivity.
+ * \param[in] fillLvl - Target degree of neighborhood (immediate neighbors always added).
+ * \return Compressed-Storage-Row sparse pattern.
+ */
+template<class Geometry_t, typename Index_t>
+CCompressedSparsePattern<Index_t> buildCSRPattern(Geometry_t& geometry,
+                                                  ConnectivityType type,
+                                                  Index_t fillLvl)
+{
+  Index_t nPoint = geometry.GetnPoint();
+
+  std::vector<Index_t> outerPtr(nPoint+1);
+  std::vector<Index_t> innerIdx;
+  innerIdx.reserve(nPoint); // at least this much space is needed
+
+  for(Index_t iPoint = 0; iPoint < nPoint; ++iPoint)
+  {
+    /*--- Inner indices for iPoint start here. ---*/
+    outerPtr[iPoint] = innerIdx.size();
+
+    /*--- Use a set to avoid duplication and keep ascending order. ---*/
+    std::set<Index_t> neighbors;
+
+    /*--- Insert base point. ---*/
+    neighbors.insert(iPoint);
+
+    /*--- Neighbors added in previous level. ---*/
+    std::set<Index_t> addedNeighbors(neighbors);
+
+    for(Index_t iLevel = 0; ; ++iLevel)
+    {
+      /*--- New points added in this level. ---*/
+      std::set<Index_t> newNeighbors;
+
+      /*--- For each point previously added, add its level 0
+       *    neighbors, not duplicating any existing neighbor. ---*/
+      for(auto jPoint : addedNeighbors)
+      {
+        auto point = geometry.node[jPoint];
+
+        if(type == ConnectivityType::FiniteVolume)
+        {
+          /*--- For FVM we know the neighbors of point j directly. ---*/
+          for(unsigned short iNeigh = 0; iNeigh < point->GetnPoint(); ++iNeigh)
+          {
+            Index_t kPoint = point->GetPoint(iNeigh);
+
+            if(neighbors.count(kPoint) == 0) // no duplication
+              newNeighbors.insert(kPoint);
+          }
+        }
+        else // FiniteElement
+        {
+          /*--- For FEM we need the nodes of all elements that contain point j. ---*/
+          for(unsigned short iNeigh = 0; iNeigh < point->GetnElem(); ++iNeigh)
+          {
+            auto elem = geometry.elem[point->GetElem(iNeigh)];
+
+            for(unsigned short iNode = 0; iNode < elem->GetnNodes(); ++iNode)
+            {
+              Index_t kPoint = elem->GetNode(iNode);
+
+              if(neighbors.count(kPoint) == 0) // no duplication
+                newNeighbors.insert(kPoint);
+            }
+          }
+        }
+      }
+
+      neighbors.insert(newNeighbors.begin(), newNeighbors.end());
+
+      if(iLevel >= fillLvl) break;
+
+      /*--- For the next level we get the neighbours of the new points. ---*/
+      addedNeighbors = newNeighbors;
+    }
+
+    /*--- Store final sparse pattern for iPoint. ---*/
+    innerIdx.insert(innerIdx.end(), neighbors.begin(), neighbors.end());
+  }
+  outerPtr.back() = innerIdx.size();
+
+  /*--- Return pattern as CCompressedSparsePattern object. ---*/
+  return CCompressedSparsePattern<Index_t>(outerPtr, innerIdx);
+}
+
+
+/*!
+ * \brief Build a lookup table of the absolute positions of the non zero entries
+ *        of a compressed sparse pattern, accessed when visiting the FVM edges
+ *        of a grid. The table can then be used for fast access (avoids searches)
+ *        to the non zero entries of a sparse matrix associated with the pattern.
+ * \param[in] geometry - Definition of the grid.
+ * \param[in] pattern - Sparse pattern.
+ * \return nEdge by 2 matrix.
+ */
+template<class Geometry_t, typename Index_t>
+CEdgeToNonZeroMap<Index_t> mapEdgesToSparsePattern(Geometry_t& geometry,
+                                                   const CCompressedSparsePattern<Index_t>& pattern)
+{
+  assert(!pattern.empty());
+
+  CEdgeToNonZeroMap<Index_t> nonZeroOfEdge(geometry.GetnEdge(),2);
+
+  for(Index_t iEdge = 0; iEdge < geometry.GetnEdge(); ++iEdge)
+  {
+    /*--- Column 0 maps to entry (i,j) of the pattern, column 1 to (j,i). ---*/
+    const Index_t node0 = geometry.edge[iEdge]->GetNode(0);
+    const Index_t node1 = geometry.edge[iEdge]->GetNode(1);
+    nonZeroOfEdge(iEdge,0) = pattern.quickFindInnerIdx(node0,node1);
+    nonZeroOfEdge(iEdge,1) = pattern.quickFindInnerIdx(node1,node0);
+  }
+
+  return nonZeroOfEdge;
+}
+
+
+/*!
+ * \brief Color contiguous groups of outer indices of a sparse pattern such that
+ *        within each color, any two groups do not have inner indices in common.
+ * \note  Within a group, two outer indices will generally have common inner indices.
+ *        The coloring is returned as a compressed sparse pattern where the colors
+ *        are outer indices, and the outer indices of the input pattern are the
+ *        inner indices of the coloring. A simple greedy algorithm is used.
+ *        Using a sparse pattern as input allows "anything" to be colored e.g.
+ *        FVM edges, FEM elements, the rows/columns of a sparse matrix, etc.
+ * \note  The worst that can happen in this method is needing an unreasonable number
+ *        of colors, or too much memory due to a large range of the inner indices.
+ *        The last two template parameters limit both, in case of failure an empty
+ *        pattern is returned.
+ * \param[in] pattern - Sparse pattern to be colored.
+ * \param[in] groupSize - Size of the outer index groups, default 1.
+ * \param[out] indexColor - Optional, vector with colors given to the outer indices.
+ * \return Coloring in the same type of the input pattern.
+ */
+template<class T, typename Color_t = char, size_t MaxColors = 64, size_t MaxMB = 128>
+T colorSparsePattern(const T& pattern, size_t groupSize = 1,
+                     std::vector<Color_t>* indexColor = nullptr)
+{
+  static_assert(std::is_integral<Color_t>::value,"");
+  static_assert(std::numeric_limits<Color_t>::max() >= MaxColors,"");
+
+  using Index_t = typename T::IndexType;
+
+  const Index_t grpSz = groupSize;
+  const Index_t nOuter = pattern.getOuterSize();
+  const Index_t minIdx = pattern.getMinInnerIdx();
+  const Index_t nInner = pattern.getMaxInnerIdx()+1-minIdx;
+
+  /*--- Check the max memory condition (<< 23 is to count bits). ---*/
+  if(size_t(nInner) > (MaxMB << 23)) return T();
+
+  /*--- Vector with the color given to each outer index. ---*/
+  std::vector<Color_t> idxColor(nOuter);
+
+  /*--- Start with one color, with no indices assigned. ---*/
+  std::vector<Index_t> colorSize(1,0);
+  Color_t color, nColor = 1;
+
+  {
+  /*--- For each color keep track of the inner indices that are in it. ---*/
+  std::vector<std::vector<bool> > innerInColor;
+  innerInColor.emplace_back(nInner, false);
+
+  auto outerPtr = pattern.outerPtr();
+  auto innerIdx = pattern.innerIdx();
+
+  for(Index_t iOuter = 0; iOuter < nOuter; iOuter += grpSz)
+  {
+    Index_t grpEnd = std::min(iOuter+grpSz, nOuter);
+
+    for(color = 0; color < nColor; ++color)
+    {
+      bool free = true;
+      /*--- Traverse entire group as a large outer index. ---*/
+      for(Index_t k = outerPtr[iOuter]; k < outerPtr[grpEnd] && free; ++k)
+      {
+        free = !innerInColor[color][innerIdx[k]-minIdx];
+      }
+      /*--- If none of the inner indices in the group appears in
+       *    this color yet, it is assigned to the group. ---*/
+      if(free) break;
+    }
+
+    /*--- No color was free, make space for a new one. ---*/
+    if(color == nColor)
+    {
+      ++nColor;
+      if(nColor == MaxColors) return T();
+      colorSize.push_back(0);
+      innerInColor.emplace_back(nInner, false);
+    }
+
+    /*--- Assign color to group. ---*/
+    for(Index_t k = iOuter; k < grpEnd; ++k) idxColor[k] = color;
+
+    /*--- Mark the inner indices of the group as belonging to the color. ---*/
+    for(Index_t k = outerPtr[iOuter]; k < outerPtr[grpEnd]; ++k)
+    {
+      innerInColor[color][innerIdx[k]-minIdx] = true;
+    }
+
+    /*--- Update count for the assigned color. ---*/
+    colorSize[color] += grpEnd - iOuter;
+  }
+  } // matrix of bools goes out of scope
+
+
+  /*--- Compress the coloring information. ---*/
+
+  su2vector<Index_t> colorPtr(nColor+1); colorPtr(0) = 0;
+  su2vector<Index_t> outerIdx(nOuter);
+
+  Index_t k = 0;
+  for(color = 0; color < nColor; ++color)
+  {
+    colorPtr(color+1) = colorPtr(color)+colorSize[color];
+
+    for(Index_t iOuter = 0; iOuter < nOuter; ++iOuter)
+      if(idxColor[iOuter] == color)
+        outerIdx(k++) = iOuter;
+  }
+
+  /*--- Optional return of the direct color information. ---*/
+  if(indexColor) *indexColor = std::move(idxColor);
+
+  /*--- Move compressed coloring into result pattern instance. ---*/
+  return T(std::move(colorPtr), std::move(outerIdx));
+}
diff --git a/Common/src/geometry/CGeometry.cpp b/Common/src/geometry/CGeometry.cpp
index f5dc732d581d..6cd8bfee5862 100644
--- a/Common/src/geometry/CGeometry.cpp
+++ b/Common/src/geometry/CGeometry.cpp
@@ -3955,3 +3955,87 @@ void CGeometry::SetGridVelocity(CConfig *config, unsigned long iter) {
   }
 
 }
+
+const CCompressedSparsePatternUL& CGeometry::GetSparsePattern(ConnectivityType type, unsigned long fillLvl)
+{
+  /*--- Cached patterns: per connectivity type there is one for fill level 0
+   *    and one that is used for any other fill level. ---*/
+  const bool isFVM = (type == ConnectivityType::FiniteVolume);
+  const bool noFill = (fillLvl == 0);
+
+  auto& fvmPattern = noFill? finiteVolumeCSRFill0 : finiteVolumeCSRFillN;
+  auto& femPattern = noFill? finiteElementCSRFill0 : finiteElementCSRFillN;
+  auto& cached = isFVM? fvmPattern : femPattern;
+
+  /*--- Build the pattern (and then call buildDiagPtr on it) only on the first request. ---*/
+  if (cached.empty()) {
+    cached = buildCSRPattern(*this, type, fillLvl);
+    cached.buildDiagPtr();
+  }
+  return cached;
+}
+
+const CEdgeToNonZeroMapUL& CGeometry::GetEdgeToSparsePatternMap(void)
+{
+  if (edgeToCSRMap.empty()) {
+    /*--- Create the FVM pattern via GetSparsePattern (if needed) so that buildDiagPtr is also
+     *    called on it, a later GetSparsePattern request does not rebuild a non-empty pattern. ---*/
+    GetSparsePattern(ConnectivityType::FiniteVolume, 0ul);
+    edgeToCSRMap = mapEdgesToSparsePattern(*this, finiteVolumeCSRFill0);
+  }
+  return edgeToCSRMap;
+}
+
+const CCompressedSparsePatternUL& CGeometry::GetEdgeColoring(void)
+{
+  /*--- The coloring is computed on the first request and then re-used. ---*/
+  if (!edgeColoring.empty()) return edgeColoring;
+
+  /*--- Describe the edges as a sparse pattern, each edge is
+   *    one row whose two entries are the nodes of the edge. ---*/
+  /// TODO: Try to avoid temporary once grid information is made contiguous.
+  su2vector<unsigned long> edgePtr(nEdge+1);
+  su2vector<unsigned long> edgeNodes(2*nEdge);
+
+  for(unsigned long iEdge = 0; iEdge <= nEdge; ++iEdge)
+    edgePtr(iEdge) = 2*iEdge;
+
+  for(unsigned long iEdge = 0; iEdge < nEdge; ++iEdge)
+    for(unsigned short iNode = 0; iNode < 2; ++iNode)
+      edgeNodes(2*iEdge+iNode) = edge[iEdge]->GetNode(iNode);
+
+  CCompressedSparsePatternUL pattern(std::move(edgePtr), std::move(edgeNodes));
+  /*--- Color the edges, an empty result is treated as an error. ---*/
+  edgeColoring = colorSparsePattern(pattern, edgeColorGroupSize);
+  if(edgeColoring.empty())
+    SU2_MPI::Error("Edge coloring failed.", CURRENT_FUNCTION);
+  return edgeColoring;
+}
+
+const CCompressedSparsePatternUL& CGeometry::GetElementColoring(void)
+{
+  /*--- The coloring is computed on the first request and then re-used. ---*/
+  if (!elemColoring.empty()) return elemColoring;
+
+  /*--- Describe the elements as a sparse pattern, each element
+   *    is one row whose entries are the nodes of the element. ---*/
+  /// TODO: Try to avoid temporary once grid information is made contiguous.
+  vector<unsigned long> elemPtr(nElem+1);
+  vector<unsigned long> elemNodes; elemNodes.reserve(nElem);
+
+  for(unsigned long iElem = 0; iElem < nElem; ++iElem) {
+    /*--- The row starts after the nodes stored so far. ---*/
+    elemPtr[iElem] = elemNodes.size();
+    for(unsigned short iNode = 0; iNode < elem[iElem]->GetnNodes(); ++iNode)
+      elemNodes.push_back(elem[iElem]->GetNode(iNode));
+  }
+  elemPtr[nElem] = elemNodes.size();
+
+  CCompressedSparsePatternUL pattern(elemPtr, elemNodes);
+
+  /*--- Color the elements, an empty result is treated as an error. ---*/
+  elemColoring = colorSparsePattern(pattern, elemColorGroupSize);
+  if(elemColoring.empty())
+    SU2_MPI::Error("Element coloring failed.", CURRENT_FUNCTION);
+  return elemColoring;
+}
diff --git a/Common/src/geometry/CPhysicalGeometry.cpp b/Common/src/geometry/CPhysicalGeometry.cpp
index 1bf475065233..93f63d4c121c 100644
--- a/Common/src/geometry/CPhysicalGeometry.cpp
+++ b/Common/src/geometry/CPhysicalGeometry.cpp
@@ -9660,12 +9660,12 @@ void CPhysicalGeometry::SetSensitivity(CConfig *config) {
   string filename = config->GetSolution_AdjFileName();
 
   su2double AoASens;
-  unsigned short nTimeIter, iDim;
-  unsigned long iPoint, index;
+  unsigned short nTimeIter;
+  unsigned long index;
   string::size_type position;
   int counter = 0;
 
-  Sensitivity = new su2double[nPoint*nDim];
+  Sensitivity.resize(nPoint,nDim) = su2double(0.0);
 
   if (config->GetTime_Domain()) {
     nTimeIter = config->GetnTime_Iter();
@@ -9679,13 +9679,6 @@ void CPhysicalGeometry::SetSensitivity(CConfig *config) {
   /*--- Read all lines in the restart file ---*/
   long iPoint_Local; unsigned long iPoint_Global = 0; string text_line;
 
-
-  for (iPoint = 0; iPoint < nPoint; iPoint++) {
-    for (iDim = 0; iDim < nDim; iDim++) {
-      Sensitivity[iPoint*nDim+iDim] = 0.0;
-    }
-  }
-
   iPoint_Global = 0;
 
   filename = config->GetSolution_AdjFileName();
@@ -9999,13 +9992,13 @@ void CPhysicalGeometry::SetSensitivity(CConfig *config) {
          offset in the buffer of data from the restart file and load it. ---*/
 
         index = counter*nFields + sens_x_idx - 1;
-        Sensitivity[iPoint_Local*nDim+0] = Restart_Data[index];
+        Sensitivity(iPoint_Local,0) = Restart_Data[index];
         index = counter*nFields + sens_y_idx - 1;
-        Sensitivity[iPoint_Local*nDim+1] = Restart_Data[index];
+        Sensitivity(iPoint_Local,1) = Restart_Data[index];
 
         if (nDim == 3){
           index = counter*nFields + sens_z_idx - 1;
-          Sensitivity[iPoint_Local*nDim+2] = Restart_Data[index];
+          Sensitivity(iPoint_Local,2) = Restart_Data[index];
         }
         /*--- Increment the overall counter for how many points have been loaded. ---*/
         counter++;
@@ -10150,11 +10143,10 @@ void CPhysicalGeometry::SetSensitivity(CConfig *config) {
     iPoint_Local = GetGlobal_to_Local_Point(iPoint_Global);
 
     if (iPoint_Local > -1) {
-      Sensitivity[iPoint_Local*nDim+0] = PrintingToolbox::stod(point_line[sens_x_idx]);
-      Sensitivity[iPoint_Local*nDim+1] = PrintingToolbox::stod(point_line[sens_y_idx]);
+      Sensitivity(iPoint_Local,0) = PrintingToolbox::stod(point_line[sens_x_idx]);
+      Sensitivity(iPoint_Local,1) = PrintingToolbox::stod(point_line[sens_y_idx]);
       if (nDim == 3)
-        Sensitivity[iPoint_Local*nDim+2] = PrintingToolbox::stod(point_line[sens_z_idx]);
-
+        Sensitivity(iPoint_Local,2) = PrintingToolbox::stod(point_line[sens_z_idx]);
     }
 
   }
@@ -10209,12 +10201,7 @@ void CPhysicalGeometry::ReadUnorderedSensitivity(CConfig *config) {
 
   /*--- Allocate space for the sensitivity and initialize. ---*/
 
-  Sensitivity = new su2double[nPoint*nDim];
-  for (iPoint = 0; iPoint < nPoint; iPoint++) {
-    for (iDim = 0; iDim < nDim; iDim++) {
-      Sensitivity[iPoint*nDim+iDim] = 0.0;
-    }
-  }
+  Sensitivity.resize(nPoint,nDim) = su2double(0.0);
 
   /*--- Get the filename for the unordered ASCII sensitivity file input. ---*/
 
@@ -10287,7 +10274,7 @@ void CPhysicalGeometry::ReadUnorderedSensitivity(CConfig *config) {
           /*--- Store the sensitivities at the matched local node. ---*/
 
           for (iDim = 0; iDim < nDim; iDim++)
-            Sensitivity[pointID*nDim+iDim] = Sens_External[iDim];
+            Sensitivity(pointID,iDim) = Sens_External[iDim];
 
           /*--- Keep track of how many points we match. ---*/
 
diff --git a/Common/src/grid_movement_structure.cpp b/Common/src/grid_movement_structure.cpp
index 8aafaea4708b..097844954f1e 100644
--- a/Common/src/grid_movement_structure.cpp
+++ b/Common/src/grid_movement_structure.cpp
@@ -29,6 +29,9 @@
 #include "../include/adt_structure.hpp"
 #include <list>
 
+#include "../include/linear_algebra/CMatrixVectorProduct.hpp"
+#include "../include/linear_algebra/CPreconditioner.hpp"
+
 using namespace std;
 
 CGridMovement::CGridMovement(void) { }
@@ -205,13 +208,13 @@ void CVolumetricMovement::SetVolume_Deformation(CGeometry *geometry, CConfig *co
         if ((rank == MASTER_NODE) && Screen_Output) cout << "\n# ILU preconditioner." << endl;
     		StiffMatrix.BuildILUPreconditioner();
     		mat_vec = new CSysMatrixVectorProduct<su2double>(StiffMatrix, geometry, config);
-    		precond = new CILUPreconditioner<su2double>(StiffMatrix, geometry, config);
+    		precond = new CILUPreconditioner<su2double>(StiffMatrix, geometry, config, false);
     	}
     	if (config->GetKind_Deform_Linear_Solver_Prec() == JACOBI) {
         if ((rank == MASTER_NODE) && Screen_Output) cout << "\n# Jacobi preconditioner." << endl;
     		StiffMatrix.BuildJacobiPreconditioner();
     		mat_vec = new CSysMatrixVectorProduct<su2double>(StiffMatrix, geometry, config);
-    		precond = new CJacobiPreconditioner<su2double>(StiffMatrix, geometry, config);
+    		precond = new CJacobiPreconditioner<su2double>(StiffMatrix, geometry, config, false);
     	}
 
     } else if (Derivative && (config->GetKind_SU2() == SU2_DOT)) {
@@ -223,13 +226,13 @@ void CVolumetricMovement::SetVolume_Deformation(CGeometry *geometry, CConfig *co
         if ((rank == MASTER_NODE) && Screen_Output) cout << "\n# ILU preconditioner." << endl;
     		StiffMatrix.BuildILUPreconditioner(true);
     		mat_vec = new CSysMatrixVectorProductTransposed<su2double>(StiffMatrix, geometry, config);
-    		precond = new CILUPreconditioner<su2double>(StiffMatrix, geometry, config);
+    		precond = new CILUPreconditioner<su2double>(StiffMatrix, geometry, config, true);
     	}
     	if (config->GetKind_Deform_Linear_Solver_Prec() == JACOBI) {
         if ((rank == MASTER_NODE) && Screen_Output) cout << "\n# Jacobi preconditioner." << endl;
     		StiffMatrix.BuildJacobiPreconditioner(true);
     		mat_vec = new CSysMatrixVectorProductTransposed<su2double>(StiffMatrix, geometry, config);
-    		precond = new CJacobiPreconditioner<su2double>(StiffMatrix, geometry, config);
+    		precond = new CJacobiPreconditioner<su2double>(StiffMatrix, geometry, config, true);
     	}
 
     }
@@ -243,7 +246,7 @@ void CVolumetricMovement::SetVolume_Deformation(CGeometry *geometry, CConfig *co
 
           Tot_Iter = 0; MaxIter = RestartIter;
 
-          System.FGMRES_LinSolver(LinSysRes, LinSysSol, *mat_vec, *precond, NumError, 1, &Residual_Init, false, config);
+          System.FGMRES_LinSolver(LinSysRes, LinSysSol, *mat_vec, *precond, NumError, 1, Residual_Init, false, config);
 
           if ((rank == MASTER_NODE) && Screen_Output) {
             cout << "\n# FGMRES (with restart) residual history" << endl;
@@ -258,7 +261,7 @@ void CVolumetricMovement::SetVolume_Deformation(CGeometry *geometry, CConfig *co
             if (IterLinSol + RestartIter > Smoothing_Iter)
               MaxIter = Smoothing_Iter - IterLinSol;
 
-            IterLinSol = System.FGMRES_LinSolver(LinSysRes, LinSysSol, *mat_vec, *precond, NumError, MaxIter, &Residual, false, config);
+            IterLinSol = System.FGMRES_LinSolver(LinSysRes, LinSysSol, *mat_vec, *precond, NumError, MaxIter, Residual, false, config);
             Tot_Iter += IterLinSol;
 
             if ((rank == MASTER_NODE) && Screen_Output) { cout << "     " << Tot_Iter << "     " << Residual/Residual_Init << endl; }
@@ -278,7 +281,7 @@ void CVolumetricMovement::SetVolume_Deformation(CGeometry *geometry, CConfig *co
 
         case FGMRES:
 
-          Tot_Iter = System.FGMRES_LinSolver(LinSysRes, LinSysSol, *mat_vec, *precond, NumError, Smoothing_Iter, &Residual, Screen_Output, config);
+          Tot_Iter = System.FGMRES_LinSolver(LinSysRes, LinSysSol, *mat_vec, *precond, NumError, Smoothing_Iter, Residual, Screen_Output, config);
 
           break;
 
@@ -286,14 +289,14 @@ void CVolumetricMovement::SetVolume_Deformation(CGeometry *geometry, CConfig *co
 
         case BCGSTAB:
 
-          Tot_Iter = System.BCGSTAB_LinSolver(LinSysRes, LinSysSol, *mat_vec, *precond, NumError, Smoothing_Iter, &Residual, Screen_Output, config);
+          Tot_Iter = System.BCGSTAB_LinSolver(LinSysRes, LinSysSol, *mat_vec, *precond, NumError, Smoothing_Iter, Residual, Screen_Output, config);
 
           break;
 
 
         case CONJUGATE_GRADIENT:
 
-          Tot_Iter = System.CG_LinSolver(LinSysRes, LinSysSol, *mat_vec, *precond, NumError, Smoothing_Iter, &Residual, Screen_Output, config);
+          Tot_Iter = System.CG_LinSolver(LinSysRes, LinSysSol, *mat_vec, *precond, NumError, Smoothing_Iter, Residual, Screen_Output, config);
 
           break;
 
diff --git a/Common/src/linear_algebra/CPastixWrapper.cpp b/Common/src/linear_algebra/CPastixWrapper.cpp
index a8ce422f81c9..d632ecb4ea46 100644
--- a/Common/src/linear_algebra/CPastixWrapper.cpp
+++ b/Common/src/linear_algebra/CPastixWrapper.cpp
@@ -7,7 +7,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
@@ -28,7 +28,12 @@
 
 #ifdef HAVE_PASTIX
 
+#include "../../include/mpi_structure.hpp"
+#include "../../include/omp_structure.hpp"
+#include "../../include/config_structure.hpp"
+#include "../../include/geometry/CGeometry.hpp"
 #include "../../include/linear_algebra/CPastixWrapper.hpp"
+
 #include<numeric>
 
 void CPastixWrapper::Initialize(CGeometry *geometry, CConfig *config) {
@@ -84,6 +89,8 @@ void CPastixWrapper::Initialize(CGeometry *geometry, CConfig *config) {
   iparm[IPARM_ORDERING]            = API_ORDER_PTSCOTCH;
   iparm[IPARM_INCOMPLETE]          = incomplete;
   iparm[IPARM_LEVEL_OF_FILL]       = pastix_int_t(config->GetPastixFillLvl());
+  iparm[IPARM_THREAD_COMM_MODE]    = API_THREAD_FUNNELED;
+  iparm[IPARM_THREAD_NBR]          = omp_get_max_threads();
 
   /*--- Prepare sparsity structure ---*/
 
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
index 4d114f5799ca..7d3e5d3287d7 100644
--- a/Common/src/linear_algebra/CSysMatrix.cpp
+++ b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -1,12 +1,12 @@
 /*!
- * \file matrix_structure.cpp
- * \brief Main subroutines for doing the sparse structures
+ * \file CSysMatrix.cpp
+ * \brief Implementation of the sparse matrix class.
  * \author F. Palacios, A. Bueno, T. Economon
  * \version 7.0.0 "Blackbird"
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
@@ -27,37 +27,44 @@
 
 #include "../../include/linear_algebra/CSysMatrix.inl"
 
+#include "../../include/geometry/CGeometry.hpp"
+#include "../../include/config_structure.hpp"
+#include "../../include/omp_structure.hpp"
+#include "../../include/toolboxes/allocation_toolbox.hpp"
+
+#include <cmath>
+
 template<class ScalarType>
 CSysMatrix<ScalarType>::CSysMatrix(void) {
 
   size = SU2_MPI::GetSize();
   rank = SU2_MPI::GetRank();
 
-  ilu_fill_in       = 0;
+  nPoint = nPointDomain = nVar = nEqn = 0;  // dimensions, set in Initialize
+  nnz = nnz_ilu = 0;
+  ilu_fill_in = 0;
+  nLinelet = 0;
+
+  omp_partitions    = nullptr;  // allocated with new[] in Initialize
 
-  /*--- Array initialization ---*/
+  matrix            = nullptr;  // Initialize refuses to run again once this is not null
+  row_ptr           = nullptr;  // row_ptr, dia_ptr, col_ind: set in Initialize from the geometry's pattern
+  dia_ptr           = nullptr;
+  col_ind           = nullptr;
 
-  matrix            = NULL;
-  ILU_matrix        = NULL;
-  row_ptr           = NULL;
-  col_ind           = NULL;
-  row_ptr_ilu       = NULL;
-  col_ind_ilu       = NULL;
-  block             = NULL;
-  prod_row_vector   = NULL;
-  aux_vector        = NULL;
-  sum_vector        = NULL;
-  invM              = NULL;
-  block_weight      = NULL;
-  block_inverse     = NULL;
+  ILU_matrix        = nullptr;  // the ILU members are only set by Initialize when an ILU preconditioner is requested
+  row_ptr_ilu       = nullptr;
+  dia_ptr_ilu       = nullptr;
+  col_ind_ilu       = nullptr;
+
+  invM              = nullptr;  // only allocated by Initialize for ILU, Jacobi or linelet preconditioners
 
 #ifdef USE_MKL
-  MatrixMatrixProductJitter              = NULL;
-  MatrixVectorProductJitterBetaOne       = NULL;
-  MatrixVectorProductJitterBetaZero      = NULL;
-  MatrixVectorProductJitterAlphaMinusOne = NULL;
-  MatrixVectorProductTranspJitterBetaOne = NULL;
-  mkl_ipiv = NULL;
+  MatrixMatrixProductJitter              = nullptr;  // jitters are destroyed in the destructor when not null
+  MatrixVectorProductJitterBetaOne       = nullptr;
+  MatrixVectorProductJitterBetaZero      = nullptr;
+  MatrixVectorProductJitterAlphaMinusOne = nullptr;
+  MatrixVectorProductTranspJitterBetaOne = nullptr;
 #endif
 
 }
@@ -65,124 +72,122 @@ CSysMatrix<ScalarType>::CSysMatrix(void) {
 template<class ScalarType>
 CSysMatrix<ScalarType>::~CSysMatrix(void) {
 
-  /*--- Memory deallocation ---*/
-
-  if (matrix != NULL)             delete [] matrix;
-  if (ILU_matrix != NULL)         delete [] ILU_matrix;
-  if (row_ptr != NULL)            delete [] row_ptr;
-  if (col_ind != NULL)            delete [] col_ind;
-
-  if (ilu_fill_in != 0) {
-    if (row_ptr_ilu != NULL) delete [] row_ptr_ilu;
-    if (col_ind_ilu != NULL) delete [] col_ind_ilu;
-  }
-
-  if (block != NULL)              delete [] block;
-  if (block_weight != NULL)       delete [] block_weight;
-  if (block_inverse != NULL)      delete [] block_inverse;
-
-  if (prod_row_vector != NULL)    delete [] prod_row_vector;
-  if (aux_vector != NULL)         delete [] aux_vector;
-  if (sum_vector != NULL)         delete [] sum_vector;
-  if (invM != NULL)               delete [] invM;
+  if (omp_partitions != nullptr) delete [] omp_partitions;  // allocated with new[] in Initialize
+  if (ILU_matrix != nullptr) MemoryAllocation::aligned_free(ILU_matrix);  // coefficient arrays come from aligned_alloc
+  if (matrix != nullptr) MemoryAllocation::aligned_free(matrix);
+  if (invM != nullptr) MemoryAllocation::aligned_free(invM);  // row/col/dia pointers are not freed, CGeometry manages that data
 
 #ifdef USE_MKL
-  if ( MatrixMatrixProductJitter != NULL )              mkl_jit_destroy( MatrixMatrixProductJitter );
-  if ( MatrixVectorProductJitterBetaZero != NULL )      mkl_jit_destroy( MatrixVectorProductJitterBetaZero );
-  if ( MatrixVectorProductJitterBetaOne  != NULL )      mkl_jit_destroy( MatrixVectorProductJitterBetaOne );
-  if ( MatrixVectorProductJitterAlphaMinusOne != NULL ) mkl_jit_destroy( MatrixVectorProductJitterAlphaMinusOne );
-  if ( MatrixVectorProductTranspJitterBetaOne != NULL ) mkl_jit_destroy( MatrixVectorProductTranspJitterBetaOne );
-  if ( mkl_ipiv != NULL ) delete [] mkl_ipiv;
+  if ( MatrixMatrixProductJitter != nullptr )              mkl_jit_destroy( MatrixMatrixProductJitter );  // created in Initialize
+  if ( MatrixVectorProductJitterBetaZero != nullptr )      mkl_jit_destroy( MatrixVectorProductJitterBetaZero );
+  if ( MatrixVectorProductJitterBetaOne  != nullptr )      mkl_jit_destroy( MatrixVectorProductJitterBetaOne );
+  if ( MatrixVectorProductJitterAlphaMinusOne != nullptr ) mkl_jit_destroy( MatrixVectorProductJitterAlphaMinusOne );
+  if ( MatrixVectorProductTranspJitterBetaOne != nullptr ) mkl_jit_destroy( MatrixVectorProductTranspJitterBetaOne );
 #endif
 
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::Initialize(unsigned long nPoint, unsigned long nPointDomain,
-                            unsigned short nVar, unsigned short nEqn,
+void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npointdomain,
+                            unsigned short nvar, unsigned short neqn,
                             bool EdgeConnect, CGeometry *geometry, CConfig *config) {
 
-  /*--- Don't delete *row_ptr, *col_ind because they are
-   asigned to the Jacobian structure. ---*/
+  assert(omp_get_thread_num()==0 && "Only the master thread is allowed to initialize the matrix.");
 
-  unsigned long iPoint, *row_ptr, *col_ind, index, nnz, Elem, iVar;
-  unsigned short iNeigh, iElem, iNode, *nNeigh, *nNeigh_ilu;
-  vector<unsigned long>::iterator it;
-  vector<unsigned long> vneighs, vneighs_ilu;
+  if(matrix != nullptr) {
+    SU2_OMP_MASTER
+    SU2_MPI::Error("CSysMatrix can only be initialized once.", CURRENT_FUNCTION);
+  }
 
-  /*--- Set the ILU fill in level --*/
+  if(nvar > MAXNVAR) {
+    SU2_OMP_MASTER
+    SU2_MPI::Error("nVar larger than expected, increase MAXNVAR.", CURRENT_FUNCTION);
+  }
 
-  ilu_fill_in = config->GetLinear_Solver_ILU_n();
+  /*--- Application of this matrix, FVM (edge connectivity) or FEM. ---*/
+  auto type = EdgeConnect? ConnectivityType::FiniteVolume : ConnectivityType::FiniteElement;
 
-  /*--- Compute the number of neighbors ---*/
+  /*--- Types of preconditioner the matrix will be asked to build. ---*/
+  unsigned short sol_prec = config->GetKind_Linear_Solver_Prec();
+  unsigned short def_prec = config->GetKind_Deform_Linear_Solver_Prec();
+  unsigned short adj_prec = config->GetKind_DiscAdj_Linear_Prec();
+  bool adjoint = config->GetDiscrete_Adjoint();
 
-  nNeigh = new unsigned short [nPoint];
-  for (iPoint = 0; iPoint < nPoint; iPoint++) {
+  bool ilu_needed = (sol_prec==ILU) || (def_prec==ILU) || (adjoint && (adj_prec==ILU));
 
-    if (EdgeConnect) {
-      nNeigh[iPoint] = (geometry->node[iPoint]->GetnPoint()+1);  // +1 -> to include diagonal element
-    }
-    else {
-      vneighs.clear();
-      for (iElem = 0; iElem < geometry->node[iPoint]->GetnElem(); iElem++) {
-        Elem =  geometry->node[iPoint]->GetElem(iElem);
-        for (iNode = 0; iNode < geometry->elem[Elem]->GetnNodes(); iNode++)
-          vneighs.push_back(geometry->elem[Elem]->GetNode(iNode));
-      }
-      vneighs.push_back(iPoint);
+  /*--- Basic dimensions. ---*/
+  nVar = nvar;
+  nEqn = neqn;
+  nPoint = npoint;
+  nPointDomain = npointdomain;
 
-      sort(vneighs.begin(), vneighs.end());
-      it = unique(vneighs.begin(), vneighs.end());
-      vneighs.resize(it - vneighs.begin());
-      nNeigh[iPoint] = vneighs.size();
-    }
+  /*--- Get sparse structure pointers from geometry,
+   *    the data is managed by CGeometry to allow re-use. ---*/
 
-  }
+  const auto& csr = geometry->GetSparsePattern(type,0);
 
-  /*--- Create row_ptr structure, using the number of neighbors ---*/
+  row_ptr = csr.outerPtr();
+  col_ind = csr.innerIdx();
+  dia_ptr = csr.diagPtr();
+  nnz = csr.getNumNonZeros();
 
-  row_ptr = new unsigned long [nPoint+1];
-  row_ptr[0] = 0;
-  for (iPoint = 0; iPoint < nPoint; iPoint++)
-    row_ptr[iPoint+1] = row_ptr[iPoint] + nNeigh[iPoint];
-  nnz = row_ptr[nPoint];
+  if (type == ConnectivityType::FiniteVolume)
+    edge_ptr.ptr = geometry->GetEdgeToSparsePatternMap().data();
 
-  /*--- Create col_ind structure ---*/
+  /*--- Get ILU sparse pattern, if fill is 0 no new data is allocated. ---*/
 
-  col_ind = new unsigned long [nnz];
-  for (iPoint = 0; iPoint < nPoint; iPoint++) {
+  if(ilu_needed)
+  {
+    ilu_fill_in = config->GetLinear_Solver_ILU_n();
 
-    vneighs.clear();
+    const auto& csr_ilu = geometry->GetSparsePattern(type, ilu_fill_in);
 
-    if (EdgeConnect) {
-      for (iNeigh = 0; iNeigh < geometry->node[iPoint]->GetnPoint(); iNeigh++)
-        vneighs.push_back(geometry->node[iPoint]->GetPoint(iNeigh));
-      vneighs.push_back(iPoint);
-    }
-    else {
-      for (iElem = 0; iElem < geometry->node[iPoint]->GetnElem(); iElem++) {
-        Elem =  geometry->node[iPoint]->GetElem(iElem);
-        for (iNode = 0; iNode < geometry->elem[Elem]->GetnNodes(); iNode++)
-          vneighs.push_back(geometry->elem[Elem]->GetNode(iNode));
-      }
-      vneighs.push_back(iPoint);
-    }
+    row_ptr_ilu = csr_ilu.outerPtr();
+    col_ind_ilu = csr_ilu.innerIdx();
+    dia_ptr_ilu = csr_ilu.diagPtr();
+    nnz_ilu = csr_ilu.getNumNonZeros();
+  }
 
-    sort(vneighs.begin(), vneighs.end());
-    it = unique(vneighs.begin(), vneighs.end());
-    vneighs.resize( it - vneighs.begin() );
+  /*--- Allocate data (64 is passed as the alignment) and set every entry to zero. ---*/
+#define ALLOC_AND_INIT(ptr,num) {\
+  ptr = MemoryAllocation::aligned_alloc<ScalarType>(64,num*sizeof(ScalarType));\
+  for(size_t k=0; k<num; ++k) ptr[k]=0.0; }
 
-    index = row_ptr[iPoint];
-    for (iNeigh = 0; iNeigh < vneighs.size(); iNeigh++) {
-      col_ind[index] = vneighs[iNeigh];
-      index++;
-    }
+  ALLOC_AND_INIT(matrix, nnz*nVar*nEqn)
 
+  /*--- Preconditioners. ---*/
+
+  if (ilu_needed) {
+    ALLOC_AND_INIT(ILU_matrix, nnz_ilu*nVar*nEqn)
+  }
+
+  if (ilu_needed || (sol_prec==JACOBI) || (sol_prec==LINELET) ||
+      (adjoint && (adj_prec==JACOBI)) || (def_prec==JACOBI))
+  {
+    ALLOC_AND_INIT(invM, nPointDomain*nVar*nEqn);
   }
+#undef ALLOC_AND_INIT
+
+  /*--- Thread parallel initialization. ---*/
+
+  int num_threads = omp_get_max_threads();
+
+  /*--- Set suitable chunk sizes for light static for loops, and heavy
+   dynamic ones, such that threads are approximately evenly loaded. ---*/
+  omp_light_size = computeStaticChunkSize(nnz*nVar*nEqn, num_threads, OMP_MAX_SIZE_L);
+  omp_heavy_size = computeStaticChunkSize(nPointDomain, num_threads, OMP_MAX_SIZE_H);
+
+  /// TODO: Get this from the config.
+  omp_num_parts = num_threads;
 
-  /*--- Set the indices in the in the sparce matrix structure, and memory allocation ---*/
+  /*--- This is akin to the row_ptr, entry "p" is the first row of partition "p", the last entry is nPointDomain. ---*/
+  omp_partitions = new unsigned long [omp_num_parts+1];
 
-  SetIndexes(nPoint, nPointDomain, nVar, nEqn, row_ptr, col_ind, nnz, config);
+  /// TODO: Use a work estimate to produce more balanced partitions.
+  auto pts_per_part = roundUpDiv(nPointDomain, omp_num_parts);
+  for(auto part = 0ul; part < omp_num_parts; ++part) /*--- Clamp, with few points per partition the offsets would overshoot. ---*/
+    omp_partitions[part] = (part*pts_per_part < nPointDomain)? part*pts_per_part : nPointDomain;
+  omp_partitions[omp_num_parts] = nPointDomain;
 
   /*--- Generate MKL Kernels ---*/
 
@@ -201,183 +206,16 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long nPoint, unsigned long nPoi
 
   mkl_jit_create_dgemm( &MatrixVectorProductTranspJitterBetaOne, MKL_COL_MAJOR, MKL_NOTRANS, MKL_NOTRANS, nVar, 1, nVar,  1.0, nVar, nVar, 1.0, nVar );
   MatrixVectorProductTranspKernelBetaOne = mkl_jit_get_dgemm_ptr( MatrixVectorProductTranspJitterBetaOne );
-
-  mkl_ipiv = new lapack_int [ nVar ];
 #endif
 
-  /*--- Initialization matrix to zero ---*/
-
-  SetValZero();
-
-  delete [] nNeigh;
-
-  /*--- ILU(n) preconditioner with a specific sparse structure ---*/
-
-  if (ilu_fill_in != 0) {
-
-    nNeigh_ilu = new unsigned short [nPoint];
-    for (iPoint = 0; iPoint < nPoint; iPoint++) {
-
-      vneighs_ilu.clear();
-      SetNeighbours(geometry, iPoint, 0, ilu_fill_in, EdgeConnect, vneighs_ilu);
-      sort(vneighs_ilu.begin(), vneighs_ilu.end());
-      it = unique(vneighs_ilu.begin(), vneighs_ilu.end());
-      vneighs_ilu.resize(it - vneighs_ilu.begin());
-      nNeigh_ilu[iPoint] = vneighs_ilu.size();
-
-    }
-
-    row_ptr_ilu = new unsigned long [nPoint+1];
-    row_ptr_ilu[0] = 0;
-    for (iPoint = 0; iPoint < nPoint; iPoint++)
-      row_ptr_ilu[iPoint+1] = row_ptr_ilu[iPoint] + nNeigh_ilu[iPoint];
-    nnz_ilu = row_ptr_ilu[nPoint];
-
-    /*--- Create col_ind structure ---*/
-
-    col_ind_ilu = new unsigned long [nnz_ilu];
-    for (iPoint = 0; iPoint < nPoint; iPoint++) {
-
-      vneighs_ilu.clear();
-      SetNeighbours(geometry, iPoint, 0, ilu_fill_in, EdgeConnect, vneighs_ilu);
-      sort(vneighs_ilu.begin(), vneighs_ilu.end());
-      it = unique(vneighs_ilu.begin(), vneighs_ilu.end());
-      vneighs_ilu.resize( it - vneighs_ilu.begin() );
-
-      index = row_ptr_ilu[iPoint];
-      for (iNeigh = 0; iNeigh < vneighs_ilu.size(); iNeigh++) {
-        col_ind_ilu[index] = vneighs_ilu[iNeigh];
-        index++;
-      }
-
-    }
-
-    ILU_matrix = new ScalarType [nnz_ilu*nVar*nEqn];
-    for (iVar = 0; iVar < nnz_ilu*nVar*nEqn; iVar++) ILU_matrix[iVar] = 0.0;
-
-    invM = new ScalarType [nPointDomain*nVar*nEqn];
-    for (iVar = 0; iVar < nPointDomain*nVar*nEqn; iVar++) invM[iVar] = 0.0;
-
-    delete [] nNeigh_ilu;
-
-  }
-
-}
-
-template<class ScalarType>
-void CSysMatrix<ScalarType>::SetNeighbours(CGeometry *geometry, unsigned long iPoint, unsigned short deep_level, unsigned short fill_level,
-                               bool EdgeConnect, vector<unsigned long> & vneighs) {
-  unsigned long Point, iElem, Elem;
-  unsigned short iNode;
-
-
-  if (EdgeConnect) {
-    vneighs.push_back(iPoint);
-    for (iNode = 0; iNode < geometry->node[iPoint]->GetnPoint(); iNode++) {
-      Point = geometry->node[iPoint]->GetPoint(iNode);
-      vneighs.push_back(Point);
-      if (deep_level < fill_level) SetNeighbours(geometry, Point, deep_level+1, fill_level, EdgeConnect, vneighs);
-    }
-  }
-  else {
-    for (iElem = 0; iElem < geometry->node[iPoint]->GetnElem(); iElem++) {
-      Elem =  geometry->node[iPoint]->GetElem(iElem);
-      for (iNode = 0; iNode < geometry->elem[Elem]->GetnNodes(); iNode++) {
-        Point = geometry->elem[Elem]->GetNode(iNode);
-        vneighs.push_back(Point);
-        if (deep_level < fill_level) SetNeighbours(geometry, Point, deep_level+1, fill_level, EdgeConnect, vneighs);
-      }
-    }
-  }
-
-}
-
-template<class ScalarType>
-void CSysMatrix<ScalarType>::SetIndexes(unsigned long val_nPoint, unsigned long val_nPointDomain, unsigned short val_nVar, unsigned short val_nEq, unsigned long* val_row_ptr, unsigned long* val_col_ind, unsigned long val_nnz, CConfig *config) {
-
-  unsigned long iVar;
-
-  nPoint       = val_nPoint;        // Assign number of points in the mesh
-  nPointDomain = val_nPointDomain;  // Assign number of points in the mesh
-  nVar         = val_nVar;          // Assign number of vars in each block system
-  nEqn         = val_nEq;           // Assign number of eqns in each block system
-
-  row_ptr      = val_row_ptr;       // Assign row values in the spare system structure (Jacobian structure)
-  col_ind      = val_col_ind;       // Assign colums values in the spare system structure (Jacobian structure)
-  nnz          = val_nnz;           // Assign number of possible non zero blocks in the spare system structure (Jacobian structure)
-
-  if (ilu_fill_in == 0) {
-    row_ptr_ilu  = val_row_ptr;       // Assign row values in the spare system structure (ILU structure)
-    col_ind_ilu  = val_col_ind;       // Assign colums values in the spare system structure (ILU structure)
-    nnz_ilu      = val_nnz;           // Assign number of possible non zero blocks in the spare system structure (ILU structure)
-  }
-
-  matrix            = new ScalarType [nnz*nVar*nEqn];  // Reserve memory for the values of the matrix
-  block             = new ScalarType [nVar*nEqn];
-  block_weight      = new ScalarType [nVar*nEqn];
-  block_inverse     = new ScalarType [nVar*nEqn];
-
-  prod_row_vector   = new ScalarType [nVar];
-  aux_vector        = new ScalarType [nVar];
-  sum_vector        = new ScalarType [nVar];
-
-  /*--- Memory initialization ---*/
-
-  for (iVar = 0; iVar < nnz*nVar*nEqn; iVar++) matrix[iVar] = 0.0;
-  for (iVar = 0; iVar < nVar*nEqn; iVar++)     block[iVar] = 0.0;
-  for (iVar = 0; iVar < nVar*nEqn; iVar++)     block_weight[iVar] = 0.0;
-  for (iVar = 0; iVar < nVar*nEqn; iVar++)     block_inverse[iVar] = 0.0;
-
-  for (iVar = 0; iVar < nVar; iVar++)          prod_row_vector[iVar] = 0.0;
-  for (iVar = 0; iVar < nVar; iVar++)          aux_vector[iVar] = 0.0;
-  for (iVar = 0; iVar < nVar; iVar++)          sum_vector[iVar] = 0.0;
-
-  if (ilu_fill_in == 0) {
-
-    /*--- Set specific preconditioner matrices (ILU) ---*/
-
-    if ((config->GetKind_Linear_Solver_Prec() == ILU) ||
-        ((config->GetKind_SU2() == SU2_DEF) && (config->GetKind_Deform_Linear_Solver_Prec() == ILU)) ||
-        ((config->GetKind_SU2() == SU2_DOT) && (config->GetKind_Deform_Linear_Solver_Prec() == ILU)) ||
-        (config->GetKind_Deform_Linear_Solver_Prec() == ILU) ||
-        (config->GetDiscrete_Adjoint() && config->GetKind_DiscAdj_Linear_Prec() == ILU)) {
-
-      /*--- Reserve memory for the ILU matrix. ---*/
-
-      ILU_matrix = new ScalarType [nnz_ilu*nVar*nEqn];
-      for (iVar = 0; iVar < nnz_ilu*nVar*nEqn; iVar++) ILU_matrix[iVar] = 0.0;
-
-      invM = new ScalarType [nPointDomain*nVar*nEqn];
-      for (iVar = 0; iVar < nPointDomain*nVar*nEqn; iVar++) invM[iVar] = 0.0;
-
-    }
-
-  }
-
-  /*--- Set specific preconditioner matrices (Jacobi and Linelet) ---*/
-
-  if ((config->GetKind_Linear_Solver_Prec() == JACOBI) ||
-      (config->GetKind_Linear_Solver_Prec() == LINELET) ||
-     ((config->GetKind_SU2() == SU2_DEF) && (config->GetKind_Deform_Linear_Solver_Prec() == JACOBI)) ||
-     ((config->GetKind_SU2() == SU2_DOT) && (config->GetKind_Deform_Linear_Solver_Prec() == JACOBI)) ||
-      (config->GetDiscrete_Adjoint() && config->GetKind_DiscAdj_Linear_Solver() == JACOBI) ||
-      (config->GetFSI_Simulation() && config->GetKind_Deform_Linear_Solver_Prec() == JACOBI))   {
-
-    /*--- Reserve memory for the values of the inverse of the preconditioner. ---*/
-
-    invM = new ScalarType [nPointDomain*nVar*nEqn];
-    for (iVar = 0; iVar < nPointDomain*nVar*nEqn; iVar++) invM[iVar] = 0.0;
-
-  }
-
 }
 
 template<class ScalarType>
 template<class OtherType>
-void CSysMatrix<ScalarType>::InitiateComms(CSysVector<OtherType> & x,
+void CSysMatrix<ScalarType>::InitiateComms(const CSysVector<OtherType> & x,
                                            CGeometry *geometry,
                                            CConfig *config,
-                                           unsigned short commType) {
+                                           unsigned short commType) const {
 
   /*--- Local variables ---*/
 
@@ -527,7 +365,7 @@ template<class OtherType>
 void CSysMatrix<ScalarType>::CompleteComms(CSysVector<OtherType> & x,
                                            CGeometry *geometry,
                                            CConfig *config,
-                                           unsigned short commType) {
+                                           unsigned short commType) const {
 
   /*--- Local variables ---*/
 
@@ -539,7 +377,7 @@ void CSysMatrix<ScalarType>::CompleteComms(CSysVector<OtherType> & x,
 
   /*--- Set some local pointers to make access simpler. ---*/
 
-  su2double *bufDRecv = geometry->bufD_P2PRecv;
+  const su2double *bufDRecv = geometry->bufD_P2PRecv;
 
   /*--- Store the data that was communicated into the appropriate
    location within the local class data structures. ---*/
@@ -650,129 +488,201 @@ void CSysMatrix<ScalarType>::CompleteComms(CSysVector<OtherType> & x,
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::DeleteValsRowi(unsigned long i) {
-
-  unsigned long block_i = i/nVar;
-  unsigned long row = i - block_i*nVar;
-  unsigned long index, iVar;
-
-  for (index = row_ptr[block_i]; index < row_ptr[block_i+1]; index++) {
-    for (iVar = 0; iVar < nVar; iVar++)
-      matrix[index*nVar*nVar+row*nVar+iVar] = 0.0; // Delete row values in the block
-    if (col_ind[index] == block_i)
-      matrix[index*nVar*nVar+row*nVar+row] = 1.0; // Set 1 to the diagonal element
-  }
-
+void CSysMatrix<ScalarType>::SetValZero() { /*--- Sets all nnz*nVar*nEqn stored coefficients of "matrix" to zero (work-shared loop). ---*/
+  SU2_OMP_FOR_STAT(omp_light_size)
+  for (auto index = 0ul; index < nnz*nVar*nEqn; index++)
+    matrix[index] = 0.0;
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::UpperProduct(const CSysVector<ScalarType> & vec, unsigned long row_i) {
+void CSysMatrix<ScalarType>::SetValDiagonalZero() { /*--- For each of the nPointDomain rows, zeroes the nVar*nEqn block addressed by dia_ptr. ---*/
+  SU2_OMP_FOR_STAT(omp_heavy_size)
+  for (auto iPoint = 0ul; iPoint < nPointDomain; ++iPoint)
+    for (auto index = 0ul; index < nVar*nEqn; ++index)
+      matrix[dia_ptr[iPoint]*nVar*nEqn + index] = 0.0;
+}
 
-  unsigned long iVar, index, col_j;
+template<class ScalarType>
+void CSysMatrix<ScalarType>::Gauss_Elimination(ScalarType* matrix, ScalarType* vec) const { /*--- Solves the dense nVar x nVar system matrix*x = vec in place: the solution overwrites "vec" and "matrix" is modified. The native (non-MKL) path divides by the diagonal entries without pivoting. ---*/
 
-  for (iVar = 0; iVar < nVar; iVar++)
-    prod_row_vector[iVar] = 0;
+#ifdef USE_MKL_LAPACK
+  // With MKL_DIRECT_CALL enabled, this is significantly faster than native code on Intel Architectures.
+  lapack_int ipiv[MAXNVAR];
+  LAPACKE_dgetrf( LAPACK_ROW_MAJOR, nVar, nVar, matrix, nVar, ipiv);
+  LAPACKE_dgetrs( LAPACK_ROW_MAJOR, 'N', nVar, 1, matrix, nVar, ipiv, vec, 1 );
+#else
+#define A(I,J) matrix[(I)*nVar+(J)]
 
-  for (index = row_ptr[row_i]; index < row_ptr[row_i+1]; index++) {
-    col_j = col_ind[index];
-    if (col_j > row_i) {
-      MatrixVectorProductAdd(&matrix[index*nVar*nVar], &vec[col_j*nVar], prod_row_vector);
+  /*--- Transform system in Upper Matrix ---*/
+  for (auto iVar = 1ul; iVar < nVar; iVar++) {
+    for (auto jVar = 0ul; jVar < iVar; jVar++) {
+      ScalarType weight = A(iVar,jVar) / A(jVar,jVar);
+      for (auto kVar = jVar; kVar < nVar; kVar++)
+        A(iVar,kVar) -= weight * A(jVar,kVar);
+      vec[iVar] -= weight * vec[jVar];
     }
   }
 
+  /*--- Backwards substitution ---*/
+  for (auto iVar = nVar; iVar > 0ul;) {
+    iVar--; // unsigned type
+    for (auto jVar = iVar+1; jVar < nVar; jVar++)
+      vec[iVar] -= A(iVar,jVar) * vec[jVar];
+    vec[iVar] /= A(iVar,iVar);
+  }
+#undef A
+#endif
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::LowerProduct(const CSysVector<ScalarType> & vec, unsigned long row_i) {
+void CSysMatrix<ScalarType>::MatrixInverse(ScalarType *matrix, ScalarType *inverse) const {
 
-  unsigned long iVar, index, col_j;
+  /*--- Computes "inverse" = "matrix"^-1 by Gaussian elimination with multiple right-hand sides
+   (the columns of the identity); "matrix" is overwritten in the process. Calling "Gauss_Elimination"
+   once per column, or generalizing it to several rhs, would hurt the performance of both routines
+   without exotic templating, hence the deliberate code duplication here. ---*/
 
-  for (iVar = 0; iVar < nVar; iVar++)
-    prod_row_vector[iVar] = 0;
+  assert((matrix != inverse) && "Output cannot be the same as the input.");
 
-  for (index = row_ptr[row_i]; index < row_ptr[row_i+1]; index++) {
-    col_j = col_ind[index];
-    if (col_j < row_i) {
-      MatrixVectorProductAdd(&matrix[index*nVar*nVar], &vec[col_j*nVar], prod_row_vector);
+#define M(I,J) inverse[(I)*nVar+(J)]
+
+  /*--- Initialize the inverse with the identity. ---*/
+  for (auto iVar = 0ul; iVar < nVar; iVar++)
+    for (auto jVar = 0ul; jVar < nVar; jVar++)
+      M(iVar,jVar) = ScalarType(iVar==jVar);
+
+  /*--- Inversion ---*/
+#ifdef USE_MKL_LAPACK
+  // With MKL_DIRECT_CALL enabled, this is significantly faster than native code on Intel Architectures.
+  lapack_int ipiv[MAXNVAR];
+  LAPACKE_dgetrf( LAPACK_ROW_MAJOR, nVar, nVar, matrix, nVar, ipiv );
+  LAPACKE_dgetrs( LAPACK_ROW_MAJOR, 'N', nVar, nVar, matrix, nVar, ipiv, inverse, nVar );
+#else
+#define A(I,J) matrix[(I)*nVar+(J)]
+
+  /*--- Transform system in Upper Matrix ---*/
+  for (auto iVar = 1ul; iVar < nVar; iVar++) {
+    for (auto jVar = 0ul; jVar < iVar; jVar++)
+    {
+      ScalarType weight = A(iVar,jVar) / A(jVar,jVar);
+
+      for (auto kVar = jVar; kVar < nVar; kVar++)
+        A(iVar,kVar) -= weight * A(jVar,kVar);
+
+      /*--- at this stage M is lower triangular so not all cols need updating ---*/
+      for (auto kVar = 0ul; kVar <= jVar; kVar++)
+        M(iVar,kVar) -= weight * M(jVar,kVar);
     }
   }
 
+  /*--- Backwards substitution ---*/
+  for (auto iVar = nVar; iVar > 0ul;) {
+    iVar--; // unsigned type
+    for (auto jVar = iVar+1; jVar < nVar; jVar++)
+      for (auto kVar = 0ul; kVar < nVar; kVar++)
+        M(iVar,kVar) -= A(iVar,jVar) * M(jVar,kVar);
+
+    for (auto kVar = 0ul; kVar < nVar; kVar++)
+      M(iVar,kVar) /= A(iVar,iVar);
+  }
+#undef A
+#endif
+#undef M
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::DiagonalProduct(const CSysVector<ScalarType> & vec, unsigned long row_i) {
+void CSysMatrix<ScalarType>::DeleteValsRowi(unsigned long i) {
 
-  for (unsigned long index = row_ptr[row_i]; index < row_ptr[row_i+1]; index++) {
-    if (col_ind[index] == row_i) {
-      MatrixVectorProduct(&matrix[index*nVar*nVar], &vec[row_i*nVar], prod_row_vector);
-      break;
-    }
+  unsigned long block_i = i/nVar;
+  unsigned long row = i - block_i*nVar;
+  unsigned long index, iVar;
+
+  for (index = row_ptr[block_i]; index < row_ptr[block_i+1]; index++) {
+    for (iVar = 0; iVar < nVar; iVar++)
+      matrix[index*nVar*nVar+row*nVar+iVar] = 0.0; // Delete row values in the block
+    if (col_ind[index] == block_i)
+      matrix[index*nVar*nVar+row*nVar+row] = 1.0; // Set 1 to the diagonal element
   }
 
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::RowProduct(const CSysVector<ScalarType> & vec, unsigned long row_i) {
-
+void CSysMatrix<ScalarType>::RowProduct(const CSysVector<ScalarType> & vec,
+                                        unsigned long row_i, ScalarType *prod) const {
   unsigned long iVar, index, col_j;
 
-  for (iVar = 0; iVar < nVar; iVar++)
-    prod_row_vector[iVar] = 0;
+  for (iVar = 0; iVar < nVar; iVar++) prod[iVar] = 0.0;
 
   for (index = row_ptr[row_i]; index < row_ptr[row_i+1]; index++) {
     col_j = col_ind[index];
-    MatrixVectorProductAdd(&matrix[index*nVar*nVar], &vec[col_j*nVar], prod_row_vector);
+    MatrixVectorProductAdd(&matrix[index*nVar*nVar], &vec[col_j*nVar], prod);
   }
 
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::MatrixVectorProduct(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config) {
-
-  unsigned long prod_begin, vec_begin, mat_begin, index, row_i;
+void CSysMatrix<ScalarType>::MatrixVectorProduct(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                                                 CGeometry *geometry, CConfig *config) const {
 
   /*--- Some checks for consistency between CSysMatrix and the CSysVector<ScalarType>s ---*/
+#ifndef NDEBUG
   if ( (nVar != vec.GetNVar()) || (nVar != prod.GetNVar()) ) {
-    cerr << "CSysMatrix<ScalarType>::MatrixVectorProduct(const CSysVector<ScalarType>&, CSysVector<ScalarType>): "
-    << "nVar values incompatible." << endl;
-    throw(-1);
+    SU2_OMP_MASTER
+    SU2_MPI::Error("nVar values incompatible.", CURRENT_FUNCTION);
   }
   if ( (nPoint != vec.GetNBlk()) || (nPoint != prod.GetNBlk()) ) {
-    cerr << "CSysMatrix<ScalarType>::MatrixVectorProduct(const CSysVector<ScalarType>&, CSysVector<ScalarType>): "
-    << "nPoint and nBlk values incompatible." << endl;
-    throw(-1);
+    SU2_OMP_MASTER
+    SU2_MPI::Error("nPoint and nBlk values incompatible.", CURRENT_FUNCTION);
   }
+#endif
 
-  prod = ScalarType(0.0); // set all entries of prod to zero
-  for (row_i = 0; row_i < nPointDomain; row_i++) {
-    prod_begin = row_i*nVar; // offset to beginning of block row_i
-    for (index = row_ptr[row_i]; index < row_ptr[row_i+1]; index++) {
-      vec_begin = col_ind[index]*nVar; // offset to beginning of block col_ind[index]
-      mat_begin = (index*nVar*nVar); // offset to beginning of matrix block[row_i][col_ind[indx]]
+  /*--- OpenMP parallelization. First need to make view of vectors
+   *    consistent, a barrier is implicit at the end of FOR section
+   *    (and it is required before master thread communicates). ---*/
+
+  SU2_OMP_BARRIER
+
+  SU2_OMP_FOR_DYN(omp_heavy_size)
+  for (auto row_i = 0ul; row_i < nPointDomain; row_i++) {
+    auto prod_begin = row_i*nVar; // offset to beginning of block row_i
+    for(auto iVar = 0ul; iVar < nVar; iVar++)
+      prod[prod_begin+iVar] = 0.0;
+    for (auto index = row_ptr[row_i]; index < row_ptr[row_i+1]; index++) {
+      auto vec_begin = col_ind[index]*nVar; // offset to beginning of block col_ind[index]
+      auto mat_begin = index*nVar*nVar; // offset to beginning of matrix block[row_i][col_ind[indx]]
       MatrixVectorProductAdd(&matrix[mat_begin], &vec[vec_begin], &prod[prod_begin]);
     }
   }
 
-  /*--- MPI Parallelization ---*/
-
-  InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
-  CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  /*--- MPI Parallelization by master thread. ---*/
 
+  SU2_OMP_MASTER
+  {
+    InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
+    CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  }
+  SU2_OMP_BARRIER
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::MatrixVectorProductTransposed(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config) {
+void CSysMatrix<ScalarType>::MatrixVectorProductTransposed(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                                                           CGeometry *geometry, CConfig *config) const {
 
   unsigned long prod_begin, vec_begin, mat_begin, index, row_i;
 
   /*--- Some checks for consistency between CSysMatrix and the CSysVector<ScalarType>s ---*/
+#ifndef NDEBUG
   if ( (nVar != vec.GetNVar()) || (nVar != prod.GetNVar()) ) {
+    SU2_OMP_MASTER
     SU2_MPI::Error("nVar values incompatible.", CURRENT_FUNCTION);
   }
   if ( (nPoint != vec.GetNBlk()) || (nPoint != prod.GetNBlk()) ) {
+    SU2_OMP_MASTER
     SU2_MPI::Error("nPoint and nBlk values incompatible.", CURRENT_FUNCTION);
   }
+#endif
 
+  /// TODO: The transpose product requires a different thread-parallel strategy.
   prod = ScalarType(0.0); // set all entries of prod to zero
   for (row_i = 0; row_i < nPointDomain; row_i++) {
     vec_begin = row_i*nVar; // offset to beginning of block col_ind[index]
@@ -794,208 +704,282 @@ template<class ScalarType>
 void CSysMatrix<ScalarType>::BuildJacobiPreconditioner(bool transpose) {
 
   /*--- Build Jacobi preconditioner (M = D), compute and store the inverses of the diagonal blocks. ---*/
+  SU2_OMP(for schedule(dynamic,omp_heavy_size) nowait)
   for (unsigned long iPoint = 0; iPoint < nPointDomain; iPoint++)
     InverseDiagonalBlock(iPoint, &(invM[iPoint*nVar*nVar]), transpose);
 
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::ComputeJacobiPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config) {
+void CSysMatrix<ScalarType>::ComputeJacobiPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                                                         CGeometry *geometry, CConfig *config) const {
 
   /*--- Apply Jacobi preconditioner, y = D^{-1} * x, the inverse of the diagonal is already known. ---*/
+  SU2_OMP_BARRIER
+  SU2_OMP_FOR_DYN(omp_heavy_size)
   for (unsigned long iPoint = 0; iPoint < nPointDomain; iPoint++)
     MatrixVectorProduct(&(invM[iPoint*nVar*nVar]), &vec[iPoint*nVar], &prod[iPoint*nVar]);
 
   /*--- MPI Parallelization ---*/
-  InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
-  CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
-
+  SU2_OMP_MASTER
+  {
+    InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
+    CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  }
+  SU2_OMP_BARRIER
 }
 
 template<class ScalarType>
 void CSysMatrix<ScalarType>::BuildILUPreconditioner(bool transposed) {
 
-  unsigned long index, index_, iVar;
-  ScalarType *Block_ij;
-  const ScalarType *Block_jk;
-  long iPoint, jPoint, kPoint;
-
-  /*--- Copy block matrix, note that the original matrix
-   is modified by the algorithm, so that we have the factorization stored
-   in the ILUMatrix at the end of this preprocessing. ---*/
-
-  for (iVar = 0; iVar < nnz_ilu*nVar*nEqn; iVar++) ILU_matrix[iVar] = 0.0;
-
-  for (iPoint = 0; iPoint < (long)nPointDomain; iPoint++) {
-    for (index = row_ptr[iPoint]; index < row_ptr[iPoint+1]; index++) {
-      jPoint = col_ind[index];
-      if (transposed) {
-        Block_ij = GetBlock(jPoint, iPoint);
-        SetBlockTransposed_ILUMatrix(iPoint, jPoint, Block_ij);
-      } else {
-        Block_ij = GetBlock(iPoint, jPoint);
-        SetBlock_ILUMatrix(iPoint, jPoint, Block_ij);
+  /*--- Copy block matrix to compute factorization in-place. ---*/
+
+  if ((ilu_fill_in == 0) && !transposed) {
+    /*--- ILU0, direct copy. ---*/
+    SU2_OMP_FOR_STAT(omp_light_size)
+    for (auto iVar = 0ul; iVar < nnz*nVar*nVar; ++iVar)
+      ILU_matrix[iVar] = matrix[iVar];
+  }
+  else {
+    /*--- ILUn clear the ILU matrix first, for ILU0^T
+     *    the copy takes care of the clearing. ---*/
+    if (ilu_fill_in > 0) {
+      SU2_OMP_FOR_STAT(omp_light_size)
+      for (auto iVar = 0ul; iVar < nnz_ilu*nVar*nVar; iVar++)
+        ILU_matrix[iVar] = 0.0;
+    }
+
+    /*--- Transposed or ILUn, traverse matrix to access its blocks
+     *    sequentially and set them in the ILU matrix. ---*/
+    SU2_OMP_FOR_DYN(omp_heavy_size)
+    for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) {
+      for (auto index = row_ptr[iPoint]; index < row_ptr[iPoint+1]; index++) {
+        auto jPoint = col_ind[index];
+        if (transposed) {
+          SetBlockTransposed_ILUMatrix(jPoint, iPoint, &matrix[index*nVar*nVar]);
+        } else {
+          SetBlock_ILUMatrix(iPoint, jPoint, &matrix[index*nVar*nVar]);
+        }
       }
     }
   }
 
   /*--- Transform system in Upper Matrix ---*/
 
-  for (iPoint = 1; iPoint < (long)nPointDomain; iPoint++) {
+  /*--- OpenMP Parallelization, a loop construct is used to ensure
+   *    the preconditioner is computed correctly even if called
+   *    outside of a parallel section. ---*/
+
+  SU2_OMP_FOR_STAT(1)
+  for(unsigned long thread = 0; thread < omp_num_parts; ++thread)
+  {
+    const auto begin = omp_partitions[thread];
+    const auto end = omp_partitions[thread+1];
+
+    /*--- Each thread will work on the submatrix defined from row/col "begin"
+     *    to row/col "end-1" (i.e. the range [begin,end[). Which is exactly
+     *    what the MPI-only implementation does. ---*/
+
+    ScalarType weight[MAXNVAR*MAXNVAR], aux_block[MAXNVAR*MAXNVAR];
+
+    for (auto iPoint = begin+1; iPoint < end; iPoint++) {
 
-    /*--- Invert and store the previous diagonal block to later compute the weight. ---*/
+      /*--- Invert and store the previous diagonal block to later compute the weight. ---*/
 
-    InverseDiagonalBlock_ILUMatrix(iPoint-1, &invM[(iPoint-1)*nVar*nVar]);
+      InverseDiagonalBlock_ILUMatrix(iPoint-1, &invM[(iPoint-1)*nVar*nVar]);
 
-    /*--- For each row (unknown), loop over all entries in A on this row
-     row_ptr_ilu[iPoint+1] will have the index for the first entry on the next
-     row. ---*/
+      /*--- For this row (unknown), loop over its lower diagonal entries. ---*/
 
-    for (index = row_ptr_ilu[iPoint]; index < row_ptr_ilu[iPoint+1]; index++) {
+      for (auto index = row_ptr_ilu[iPoint]; index < dia_ptr_ilu[iPoint]; index++) {
 
-      /*--- jPoint here is the column for each entry on this row ---*/
+        /*--- jPoint is the column index (jPoint < iPoint). ---*/
 
-      jPoint = col_ind_ilu[index];
+        auto jPoint = col_ind_ilu[index];
 
-      /*--- Check that this column is in the lower triangular portion ---*/
+        /*--- We only care about the sub matrix within "begin" and "end-1". ---*/
 
-      if (jPoint < iPoint) {
+        if (jPoint < begin) continue;
 
-        /*--- If we're in the lower triangle, multiply the block by
-         the inverse of the corresponding diagonal block. ---*/
+        /*--- Multiply the block by the inverse of the corresponding diagonal block. ---*/
 
-        Block_ij = &ILU_matrix[index*nVar*nEqn];
-        MatrixMatrixProduct(Block_ij, &invM[jPoint*nVar*nVar], block_weight);
+        auto Block_ij = &ILU_matrix[index*nVar*nVar];
+        MatrixMatrixProduct(Block_ij, &invM[jPoint*nVar*nVar], weight);
 
-        /*--- block_weight holds Aij*inv(Ajj). Jump to the row for jPoint ---*/
+        /*--- "weight" holds Aij*inv(Ajj). Jump to the upper part of the jPoint row. ---*/
 
-        for (index_ = row_ptr_ilu[jPoint]; index_ < row_ptr_ilu[jPoint+1]; index_++) {
+        for (auto index_ = dia_ptr_ilu[jPoint]+1; index_ < row_ptr_ilu[jPoint+1]; index_++) {
 
-          /*--- Get the column of the entry ---*/
+          /*--- Get the column index (kPoint > jPoint). ---*/
 
-          kPoint = col_ind_ilu[index_];
+          auto kPoint = col_ind_ilu[index_];
 
-          /*--- If the column is greater than or equal to jPoint, i.e., the
-           upper triangular part, then multiply and modify the matrix.
-           Here, Aik' = Aik - Aij*inv(Ajj)*Ajk. ---*/
+          if (kPoint >= end) break;
 
-          if (kPoint > jPoint) {
+          /*--- If Aik exists, update it: Aik -= Aij*inv(Ajj)*Ajk ---*/
 
-            Block_jk = &ILU_matrix[index_*nVar*nEqn];
-            MatrixMatrixProduct(block_weight, Block_jk, block);
-            SubtractBlock_ILUMatrix(iPoint, kPoint, block);
+          auto Block_ik = GetBlock_ILUMatrix(iPoint, kPoint);
 
+          if (Block_ik != nullptr) {
+            auto Block_jk = &ILU_matrix[index_*nVar*nVar];
+            MatrixMatrixProduct(weight, Block_jk, aux_block);
+            MatrixSubtraction(Block_ik, aux_block, Block_ik);
           }
         }
 
-        /*--- Lastly, store block_weight in the lower triangular part, which
+        /*--- Lastly, store "weight" in the lower triangular part, which
          will be reused during the forward solve in the precon/smoother. ---*/
 
-        for (iVar = 0; iVar < nVar*nEqn; ++iVar)
-          Block_ij[iVar] = block_weight[iVar];
-
+        for (auto iVar = 0ul; iVar < nVar*nVar; ++iVar)
+          Block_ij[iVar] = weight[iVar];
       }
     }
-  }
+    if (end > begin) InverseDiagonalBlock_ILUMatrix(end-1, &invM[(end-1)*nVar*nVar]); // last row of this partition; skipped for empty partitions, where "end-1" would wrap (end==0) or touch another thread's row
 
-  InverseDiagonalBlock_ILUMatrix(nPointDomain-1, &invM[(nPointDomain-1)*nVar*nVar]);
+  } // end parallel
 
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::ComputeILUPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config) {
-
-  unsigned long index, iVar;
-  const ScalarType *Block_ij;
-  long iPoint, jPoint;
-
-  /*--- Copy vector to then work on prod in place ---*/
-
-  for (iPoint = 0; iPoint < long(nPointDomain*nVar); iPoint++)
-    prod[iPoint] = vec[iPoint];
-
-  /*--- Forward solve the system using the lower matrix entries that
-   were computed and stored during the ILU preprocessing. Note
-   that we are overwriting the residual vector as we go. ---*/
-
-  for (iPoint = 1; iPoint < (long)nPointDomain; iPoint++) {
-    for (index = row_ptr_ilu[iPoint]; index < row_ptr_ilu[iPoint+1]; index++) {
-      jPoint = col_ind_ilu[index];
-      if (jPoint < iPoint) {
-        Block_ij = &ILU_matrix[index*nVar*nEqn];
+void CSysMatrix<ScalarType>::ComputeILUPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                                                      CGeometry *geometry, CConfig *config) const {
+  /*--- Coherent view of vectors. ---*/
+  SU2_OMP_BARRIER
+
+  /*--- OpenMP Parallelization ---*/
+  SU2_OMP_FOR_STAT(1)
+  for(unsigned long thread = 0; thread < omp_num_parts; ++thread)
+  {
+    const auto begin = omp_partitions[thread];
+    const auto end = omp_partitions[thread+1];
+
+    ScalarType aux_vec[MAXNVAR];
+
+    /*--- Copy vector to then work on prod in place ---*/
+
+    for (auto iVar = begin*nVar; iVar < end*nVar; iVar++)
+      prod[iVar] = vec[iVar];
+
+    /*--- Forward solve the system using the lower matrix entries that
+     were computed and stored during the ILU preprocessing. Note
+     that we are overwriting the residual vector as we go. ---*/
+
+    for (auto iPoint = begin+1; iPoint < end; iPoint++) {
+      for (auto index = row_ptr_ilu[iPoint]; index < dia_ptr_ilu[iPoint]; index++) {
+        auto jPoint = col_ind_ilu[index];
+        if (jPoint < begin) continue;
+        auto Block_ij = &ILU_matrix[index*nVar*nVar];
         MatrixVectorProductSub(Block_ij, &prod[jPoint*nVar], &prod[iPoint*nVar]);
       }
     }
-  }
 
-  /*--- Backwards substitution (starts at the last row) ---*/
+    /*--- Backwards substitution (starts at the last row) ---*/
 
-  for (iPoint = nPointDomain-1; iPoint >= 0; iPoint--) {
-
-    for (iVar = 0; iVar < nVar; iVar++)
-      sum_vector[iVar] = prod[iPoint*nVar+iVar];
+    for (auto iPoint = end; iPoint > begin;) {
+      iPoint--; // unsigned type
+      for (auto iVar = 0ul; iVar < nVar; iVar++)
+        aux_vec[iVar] = prod[iPoint*nVar+iVar];
 
-    for (index = row_ptr_ilu[iPoint]; index < row_ptr_ilu[iPoint+1]; index++) {
-      jPoint = col_ind_ilu[index];
-      if ((jPoint >= iPoint+1) && (jPoint < (long)nPointDomain)) {
-        Block_ij = &ILU_matrix[index*nVar*nEqn];
-        MatrixVectorProductSub(Block_ij, &prod[jPoint*nVar], sum_vector);
+      for (auto index = dia_ptr_ilu[iPoint]+1; index < row_ptr_ilu[iPoint+1]; index++) {
+        auto jPoint = col_ind_ilu[index];
+        if (jPoint >= end) break;
+        auto Block_ij = &ILU_matrix[index*nVar*nVar];
+        MatrixVectorProductSub(Block_ij, &prod[jPoint*nVar], aux_vec);
       }
-    }
 
-    MatrixVectorProduct(&invM[iPoint*nVar*nVar], sum_vector, &prod[iPoint*nVar]);
-  }
+      MatrixVectorProduct(&invM[iPoint*nVar*nVar], aux_vec, &prod[iPoint*nVar]);
+    }
+  } // end parallel
 
   /*--- MPI Parallelization ---*/
 
-  InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
-  CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
-
+  SU2_OMP_MASTER
+  {
+    InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
+    CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  }
+  SU2_OMP_BARRIER
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::ComputeLU_SGSPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod, CGeometry *geometry, CConfig *config) {
-  unsigned long iPoint, iVar;
+void CSysMatrix<ScalarType>::ComputeLU_SGSPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
+                                                         CGeometry *geometry, CConfig *config) const {
 
   /*--- First part of the symmetric iteration: (D+L).x* = b ---*/
 
-  for (iPoint = 0; iPoint < nPointDomain; iPoint++) {
-    LowerProduct(prod, iPoint);                                               // Compute L.x*
-    for (iVar = 0; iVar < nVar; iVar++)
-      prod[iPoint*nVar+iVar] = vec[iPoint*nVar+iVar] - prod_row_vector[iVar]; // Compute aux_vector = b - L.x*
-    Gauss_Elimination(iPoint, &prod[iPoint*nVar]);                            // Solve D.x* = aux_vector
-  }
+  /*--- Coherent view of vectors. ---*/
+  SU2_OMP_BARRIER
 
-  /*--- MPI Parallelization ---*/
+  /*--- OpenMP Parallelization ---*/
+  SU2_OMP_FOR_STAT(1)
+  for(unsigned long thread = 0; thread < omp_num_parts; ++thread)
+  {
+    const auto begin = omp_partitions[thread];
+    const auto end = omp_partitions[thread+1];
 
-  InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
-  CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+    /*--- Each thread will work on the submatrix defined from row/col "begin"
+     *    to row/col "end-1", except the last thread that also considers halos.
+     *    This is NOT exactly equivalent to the MPI implementation on the same
+     *    number of domains, for that we would need to define "thread-halos". ---*/
 
-  /*--- Second part of the symmetric iteration: (D+U).x_(1) = D.x* ---*/
+    ScalarType low_prod[MAXNVAR];
 
-  for (iPoint = nPointDomain-1; (int)iPoint >= 0; iPoint--) {
-    DiagonalProduct(prod, iPoint);                                        // Compute D.x*
-    for (iVar = 0; iVar < nVar; iVar++)
-      aux_vector[iVar] = prod_row_vector[iVar];                           // Compute aux_vector = D.x*
-    UpperProduct(prod, iPoint);                                           // Compute U.x_(n+1)
-    for (iVar = 0; iVar < nVar; iVar++)
-      prod[iPoint*nVar+iVar] = aux_vector[iVar] - prod_row_vector[iVar];  // Compute aux_vector = D.x*-U.x_(n+1)
-    Gauss_Elimination(iPoint, &prod[iPoint*nVar]);                        // Solve D.x* = aux_vector
-  }
+    for (auto iPoint = begin; iPoint < end; ++iPoint) {
+      auto idx = iPoint*nVar;
+      LowerProduct(prod, iPoint, begin, low_prod);        // Compute L.x*
+      VectorSubtraction(&vec[idx], low_prod, &prod[idx]); // Compute y = b - L.x*
+      Gauss_Elimination(iPoint, &prod[idx]);              // Solve D.x* = y
+    }
+  } // end parallel
 
   /*--- MPI Parallelization ---*/
+  SU2_OMP_MASTER
+  {
+    InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
+    CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  }
+  SU2_OMP_BARRIER
 
-  InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
-  CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  /*--- Second part of the symmetric iteration: (D+U).x_(1) = D.x* ---*/
+
+  /*--- OpenMP Parallelization ---*/
+  SU2_OMP_FOR_STAT(1)
+  for(unsigned long thread = 0; thread < omp_num_parts; ++thread)
+  {
+    const auto begin = omp_partitions[thread];
+    const auto row_end = omp_partitions[thread+1];
+    /*--- On the last thread partition the upper
+     *    product should consider halo columns. ---*/
+    const auto col_end = (row_end==nPointDomain)? nPoint : row_end;
+
+    ScalarType up_prod[MAXNVAR], dia_prod[MAXNVAR];
+
+    for (auto iPoint = row_end; iPoint > begin;) {
+      iPoint--; // because of unsigned type
+      auto idx = iPoint*nVar;
+      DiagonalProduct(prod, iPoint, dia_prod);          // Compute D.x*
+      UpperProduct(prod, iPoint, col_end, up_prod);     // Compute U.x_(n+1)
+      VectorSubtraction(dia_prod, up_prod, &prod[idx]); // Compute y = D.x*-U.x_(n+1)
+      Gauss_Elimination(iPoint, &prod[idx]);            // Solve D.x* = y
+    }
+  } // end parallel
 
+  /*--- MPI Parallelization ---*/
+  SU2_OMP_MASTER
+  {
+    InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
+    CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  }
+  SU2_OMP_BARRIER
 }
 
 template<class ScalarType>
-unsigned short CSysMatrix<ScalarType>::BuildLineletPreconditioner(CGeometry *geometry, CConfig *config) {
+unsigned long CSysMatrix<ScalarType>::BuildLineletPreconditioner(CGeometry *geometry, CConfig *config) {
+
+  assert(omp_get_thread_num()==0 && "Linelet preconditioner cannot be built by multiple threads.");
 
   bool add_point;
   unsigned long iEdge, iPoint, jPoint, index_Point, iLinelet, iVertex, next_Point, counter, iElem;
-  unsigned short iMarker, iNode, MeanPoints;
+  unsigned short iMarker, iNode;
   su2double alpha = 0.9, weight, max_weight, *normal, area, volume_iPoint, volume_jPoint;
   unsigned long Local_nPoints, Local_nLineLets, Global_nPoints, Global_nLineLets, max_nElem;
 
@@ -1003,6 +987,7 @@ unsigned short CSysMatrix<ScalarType>::BuildLineletPreconditioner(CGeometry *geo
 
   vector<bool> check_Point(nPoint,true);
 
+  LineletBool.clear();
   LineletBool.resize(nPoint,false);
 
   nLinelet = 0;
@@ -1142,141 +1127,139 @@ unsigned short CSysMatrix<ScalarType>::BuildLineletPreconditioner(CGeometry *geo
   }
   Local_nLineLets = nLinelet;
 
-#ifndef HAVE_MPI
-  Global_nPoints = Local_nPoints;
-  Global_nLineLets = Local_nLineLets;
-#else
   SU2_MPI::Allreduce(&Local_nPoints, &Global_nPoints, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
   SU2_MPI::Allreduce(&Local_nLineLets, &Global_nLineLets, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
-#endif
-
-  MeanPoints = SU2_TYPE::Int(ScalarType(Global_nPoints)/ScalarType(Global_nLineLets));
 
   /*--- Memory allocation --*/
 
-  LineletUpper.resize(max_nElem,NULL);
-  LineletInvDiag.resize(max_nElem*nVar*nVar,0.0);
-  LineletVector.resize(max_nElem*nVar,0.0);
+  LineletUpper.resize(omp_get_max_threads(), vector<const ScalarType*>(max_nElem,nullptr));
+  LineletVector.resize(omp_get_max_threads(), vector<ScalarType>(max_nElem*nVar,0.0));
+  LineletInvDiag.resize(omp_get_max_threads(), vector<ScalarType>(max_nElem*nVar*nVar,0.0));
 
-  return MeanPoints;
+  return (unsigned long)(passivedouble(Global_nPoints) / Global_nLineLets);
 
 }
 
 template<class ScalarType>
 void CSysMatrix<ScalarType>::ComputeLineletPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
-                                              CGeometry *geometry, CConfig *config) {
-
-  unsigned long iVar, iElem, nElem, iLinelet, iPoint, im1Point;
-  /*--- Pointers to lower, upper, and diagonal blocks ---*/
-  const ScalarType *l = NULL, *u = NULL, *d = NULL;
-  /*--- Inverse of d_{i-1}, modified d_i, modified b_i (rhs) ---*/
-  ScalarType *inv_dm1 = NULL, *d_prime = NULL, *b_prime = NULL;
-
-//  if (size != SINGLE_NODE)
-//    SU2_MPI::Error("Linelet not implemented in parallel.", CURRENT_FUNCTION);
+                                                          CGeometry *geometry, CConfig *config) const {
+  /*--- Barrier so that all threads have a coherent view of the vectors. ---*/
+  SU2_OMP_BARRIER
 
   /*--- Jacobi preconditioning where there is no linelet ---*/
 
-  for (iPoint = 0; iPoint < nPointDomain; iPoint++)
+  SU2_OMP(for schedule(dynamic,omp_heavy_size) nowait)
+  for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++)
     if (!LineletBool[iPoint])
       MatrixVectorProduct(&(invM[iPoint*nVar*nVar]), &vec[iPoint*nVar], &prod[iPoint*nVar]);
 
-  /*--- MPI Parallelization ---*/
-
-  InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
-  CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  /*--- Solve each linelet using the Thomas algorithm ---*/
 
-  /*--- Solve linelet using the Thomas algorithm ---*/
+  SU2_OMP_FOR_DYN(1)
+  for (auto iLinelet = 0ul; iLinelet < nLinelet; iLinelet++) {
 
-  for (iLinelet = 0; iLinelet < nLinelet; iLinelet++) {
+    /*--- Get references to the working vectors allocated for this thread. ---*/
 
-    nElem = LineletPoint[iLinelet].size();
+    int thread = omp_get_thread_num();
+    vector<const ScalarType*>& lineletUpper = LineletUpper[thread];
+    vector<ScalarType>& lineletInvDiag = LineletInvDiag[thread];
+    vector<ScalarType>& lineletVector = LineletVector[thread];
 
     /*--- Initialize the solution vector with the rhs ---*/
 
-    for (iElem = 0; iElem < nElem; iElem++) {
-      iPoint = LineletPoint[iLinelet][iElem];
-      for (iVar = 0; iVar < nVar; iVar++)
-        LineletVector[iElem*nVar+iVar] = vec[iPoint*nVar+iVar];
+    auto nElem = LineletPoint[iLinelet].size();
+
+    for (auto iElem = 0ul; iElem < nElem; iElem++) {
+      auto iPoint = LineletPoint[iLinelet][iElem];
+      for (auto iVar = 0ul; iVar < nVar; iVar++)
+        lineletVector[iElem*nVar+iVar] = vec[iPoint*nVar+iVar];
     }
 
-    /*--- Forward pass, eliminate lower entries, modify diagonal and rhs ---*/
+    /*--- Forward pass, eliminate lower entries, modify diagonal and rhs. ---*/
+
+    /*--- Small temporaries. ---*/
+    ScalarType aux_block[MAXNVAR*MAXNVAR], aux_vector[MAXNVAR];
 
-    iPoint = LineletPoint[iLinelet][0];
-    d = GetBlock(iPoint, iPoint);
-    for (iVar = 0; iVar < nVar*nVar; ++iVar)
-      LineletInvDiag[iVar] = d[iVar];
+    /*--- Copy diagonal block for first point in this linelet. ---*/
+    MatrixCopy(&matrix[dia_ptr[LineletPoint[iLinelet][0]]*nVar*nVar],
+               lineletInvDiag.data());
 
-    for (iElem = 1; iElem < nElem; iElem++) {
+    for (auto iElem = 1ul; iElem < nElem; iElem++) {
 
       /*--- Setup pointers to required matrices and vectors ---*/
-      im1Point = LineletPoint[iLinelet][iElem-1];
-      iPoint = LineletPoint[iLinelet][iElem];
+      auto im1Point = LineletPoint[iLinelet][iElem-1];
+      auto iPoint = LineletPoint[iLinelet][iElem];
 
-      d = GetBlock(iPoint, iPoint);
-      l = GetBlock(iPoint, im1Point);
-      u = GetBlock(im1Point, iPoint);
+      auto d = &matrix[dia_ptr[iPoint]*nVar*nVar];
+      auto l = GetBlock(iPoint, im1Point);
+      auto u = GetBlock(im1Point, iPoint);
 
-      inv_dm1 = &LineletInvDiag[(iElem-1)*nVar*nVar];
-      d_prime = &LineletInvDiag[iElem*nVar*nVar];
-      b_prime = &LineletVector[iElem*nVar];
+      auto inv_dm1 = &lineletInvDiag[(iElem-1)*nVar*nVar];
+      auto d_prime = &lineletInvDiag[iElem*nVar*nVar];
+      auto b_prime = &lineletVector[iElem*nVar];
 
       /*--- Invert previous modified diagonal ---*/
-      MatrixInverse(inv_dm1, inv_dm1);
+      MatrixCopy(inv_dm1, aux_block);
+      MatrixInverse(aux_block, inv_dm1);
 
       /*--- Left-multiply by lower block to obtain the weight ---*/
-      MatrixMatrixProduct(l, inv_dm1, block_weight);
+      MatrixMatrixProduct(l, inv_dm1, aux_block);
 
       /*--- Multiply weight by upper block to modify current diagonal ---*/
-      MatrixMatrixProduct(block_weight, u, d_prime);
+      MatrixMatrixProduct(aux_block, u, d_prime);
       MatrixSubtraction(d, d_prime, d_prime);
 
       /*--- Update the rhs ---*/
-      MatrixVectorProduct(block_weight, &LineletVector[(iElem-1)*nVar], aux_vector);
+      MatrixVectorProduct(aux_block, &lineletVector[(iElem-1)*nVar], aux_vector);
       VectorSubtraction(b_prime, aux_vector, b_prime);
 
       /*--- Cache upper block pointer for the backward substitution phase ---*/
-      LineletUpper[iElem-1] = u;
+      lineletUpper[iElem-1] = u;
     }
 
     /*--- Backwards substitution, LineletVector becomes the solution ---*/
 
     /*--- x_n = d_n^{-1} * b_n ---*/
-    Gauss_Elimination(&LineletInvDiag[(nElem-1)*nVar*nVar], &LineletVector[(nElem-1)*nVar]);
+    Gauss_Elimination(&lineletInvDiag[(nElem-1)*nVar*nVar], &lineletVector[(nElem-1)*nVar]);
 
     /*--- x_i = d_i^{-1}*(b_i - u_i*x_{i+1}) ---*/
-    for (iElem = nElem-1; iElem > 0; --iElem) {
-      inv_dm1 = &LineletInvDiag[(iElem-1)*nVar*nVar];
-      MatrixVectorProduct(LineletUpper[iElem-1], &LineletVector[iElem*nVar], aux_vector);
-      VectorSubtraction(&LineletVector[(iElem-1)*nVar], aux_vector, aux_vector);
-      MatrixVectorProduct(inv_dm1, aux_vector, &LineletVector[(iElem-1)*nVar]);
+    for (auto iElem = nElem-1; iElem > 0; --iElem) {
+      auto inv_dm1 = &lineletInvDiag[(iElem-1)*nVar*nVar];
+      MatrixVectorProduct(lineletUpper[iElem-1], &lineletVector[iElem*nVar], aux_vector);
+      VectorSubtraction(&lineletVector[(iElem-1)*nVar], aux_vector, aux_vector);
+      MatrixVectorProduct(inv_dm1, aux_vector, &lineletVector[(iElem-1)*nVar]);
     }
 
     /*--- Copy results to product vector ---*/
 
-    for (iElem = 0; iElem < nElem; iElem++) {
-      iPoint = LineletPoint[iLinelet][iElem];
-      for (iVar = 0; iVar < nVar; iVar++)
-        prod[iPoint*nVar+iVar] = LineletVector[iElem*nVar+iVar];
+    for (auto iElem = 0ul; iElem < nElem; iElem++) {
+      auto iPoint = LineletPoint[iLinelet][iElem];
+      for (auto iVar = 0ul; iVar < nVar; iVar++)
+        prod[iPoint*nVar+iVar] = lineletVector[iElem*nVar+iVar];
     }
 
   }
 
   /*--- MPI Parallelization ---*/
 
-  InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
-  CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  SU2_OMP_MASTER
+  {
+    InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
+    CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  }
+  SU2_OMP_BARRIER
 
 }
 
 template<class ScalarType>
-void CSysMatrix<ScalarType>::ComputeResidual(const CSysVector<ScalarType> & sol, const CSysVector<ScalarType> & f, CSysVector<ScalarType> & res) {
-
+void CSysMatrix<ScalarType>::ComputeResidual(const CSysVector<ScalarType> & sol, const CSysVector<ScalarType> & f,
+                                             CSysVector<ScalarType> & res) const {
+  SU2_OMP_FOR_DYN(omp_heavy_size)
   for (unsigned long iPoint = 0; iPoint < nPointDomain; iPoint++) {
-    RowProduct(sol, iPoint);
-    VectorSubtraction(prod_row_vector, &f[iPoint*nVar], &res[iPoint*nVar]);
+    ScalarType aux_vec[MAXNVAR];
+    RowProduct(sol, iPoint, aux_vec);
+    VectorSubtraction(aux_vec, &f[iPoint*nVar], &res[iPoint*nVar]);
   }
-
 }
 
 template<class ScalarType>
@@ -1317,25 +1300,57 @@ void CSysMatrix<ScalarType>::EnforceSolutionAtNode(const unsigned long node_i, c
 
 }
 
+template<class ScalarType>
+template<class OtherType>
+void CSysMatrix<ScalarType>::MatrixMatrixAddition(OtherType alpha, const CSysMatrix<OtherType>& B) {
+
+  /*--- Check that the sparse structure is shared between the two matrices;
+   *    comparing pointers is ok as they are obtained from CGeometry. ---*/
+  bool ok = (row_ptr == B.row_ptr) && (col_ind == B.col_ind) &&
+            (nVar == B.nVar) && (nEqn == B.nEqn) && (nnz == B.nnz);
+
+  if (!ok) {
+    SU2_OMP_MASTER
+    SU2_MPI::Error("Matrices do not have compatible sparsity.", CURRENT_FUNCTION);
+  }
+
+  SU2_OMP_FOR_STAT(omp_light_size)
+  for (auto i = 0ul; i < nnz*nVar*nEqn; ++i)
+    matrix[i] += PassiveAssign<ScalarType,OtherType>(alpha*B.matrix[i]);
+
+}
+
 template<class ScalarType>
 void CSysMatrix<ScalarType>::BuildPastixPreconditioner(CGeometry *geometry, CConfig *config,
                                                        unsigned short kind_fact, bool transposed) {
 #ifdef HAVE_PASTIX
-  pastix_wrapper.SetMatrix(nVar,nPoint,nPointDomain,row_ptr,col_ind,matrix);
-  pastix_wrapper.Factorize(geometry, config, kind_fact, transposed);
+  /*--- Pastix will launch nested threads. ---*/
+  SU2_OMP_MASTER
+  {
+    pastix_wrapper.SetMatrix(nVar,nPoint,nPointDomain,row_ptr,col_ind,matrix);
+    pastix_wrapper.Factorize(geometry, config, kind_fact, transposed);
+  }
+  SU2_OMP_BARRIER
 #else
+  SU2_OMP_MASTER
   SU2_MPI::Error("SU2 was not compiled with -DHAVE_PASTIX", CURRENT_FUNCTION);
 #endif
 }
 
 template<class ScalarType>
 void CSysMatrix<ScalarType>::ComputePastixPreconditioner(const CSysVector<ScalarType> & vec, CSysVector<ScalarType> & prod,
-                                                         CGeometry *geometry, CConfig *config) {
+                                                         CGeometry *geometry, CConfig *config) const {
 #ifdef HAVE_PASTIX
-  pastix_wrapper.Solve(vec,prod);
-  InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
-  CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  SU2_OMP_BARRIER
+  SU2_OMP_MASTER
+  {
+    pastix_wrapper.Solve(vec,prod);
+    InitiateComms(prod, geometry, config, SOLUTION_MATRIX);
+    CompleteComms(prod, geometry, config, SOLUTION_MATRIX);
+  }
+  SU2_OMP_BARRIER
 #else
+  SU2_OMP_MASTER
   SU2_MPI::Error("SU2 was not compiled with -DHAVE_PASTIX", CURRENT_FUNCTION);
 #endif
 }
@@ -1344,27 +1359,32 @@ void CSysMatrix<ScalarType>::ComputePastixPreconditioner(const CSysVector<Scalar
 template<>
 void CSysMatrix<su2double>::BuildPastixPreconditioner(CGeometry *geometry, CConfig *config,
                                                       unsigned short kind_fact, bool transposed) {
+  SU2_OMP_MASTER
   SU2_MPI::Error("The PaStiX preconditioner is only available in CSysMatrix<passivedouble>", CURRENT_FUNCTION);
 }
 template<>
 void CSysMatrix<su2double>::ComputePastixPreconditioner(const CSysVector<su2double> & vec, CSysVector<su2double> & prod,
-                                                        CGeometry *geometry, CConfig *config) {
+                                                        CGeometry *geometry, CConfig *config) const {
+  SU2_OMP_MASTER
   SU2_MPI::Error("The PaStiX preconditioner is only available in CSysMatrix<passivedouble>", CURRENT_FUNCTION);
 }
 #endif
 
 /*--- Explicit instantiations ---*/
 template class CSysMatrix<su2double>;
-template void  CSysMatrix<su2double>::InitiateComms(CSysVector<su2double>&, CGeometry*, CConfig*, unsigned short);
-template void  CSysMatrix<su2double>::CompleteComms(CSysVector<su2double>&, CGeometry*, CConfig*, unsigned short);
+template void  CSysMatrix<su2double>::InitiateComms(const CSysVector<su2double>&, CGeometry*, CConfig*, unsigned short) const;
+template void  CSysMatrix<su2double>::CompleteComms(CSysVector<su2double>&, CGeometry*, CConfig*, unsigned short) const;
 template void  CSysMatrix<su2double>::EnforceSolutionAtNode(unsigned long, const su2double*, CSysVector<su2double>&);
+template void  CSysMatrix<su2double>::MatrixMatrixAddition(su2double, const CSysMatrix<su2double>&);
 
 #ifdef CODI_REVERSE_TYPE
 template class CSysMatrix<passivedouble>;
-template void  CSysMatrix<passivedouble>::InitiateComms(CSysVector<passivedouble>&, CGeometry*, CConfig*, unsigned short);
-template void  CSysMatrix<passivedouble>::InitiateComms(CSysVector<su2double>&, CGeometry*, CConfig*, unsigned short);
-template void  CSysMatrix<passivedouble>::CompleteComms(CSysVector<passivedouble>&, CGeometry*, CConfig*, unsigned short);
-template void  CSysMatrix<passivedouble>::CompleteComms(CSysVector<su2double>&, CGeometry*, CConfig*, unsigned short);
+template void  CSysMatrix<passivedouble>::InitiateComms(const CSysVector<passivedouble>&, CGeometry*, CConfig*, unsigned short) const;
+template void  CSysMatrix<passivedouble>::InitiateComms(const CSysVector<su2double>&, CGeometry*, CConfig*, unsigned short) const;
+template void  CSysMatrix<passivedouble>::CompleteComms(CSysVector<passivedouble>&, CGeometry*, CConfig*, unsigned short) const;
+template void  CSysMatrix<passivedouble>::CompleteComms(CSysVector<su2double>&, CGeometry*, CConfig*, unsigned short) const;
 template void  CSysMatrix<passivedouble>::EnforceSolutionAtNode(unsigned long, const passivedouble*, CSysVector<passivedouble>&);
 template void  CSysMatrix<passivedouble>::EnforceSolutionAtNode(unsigned long, const su2double*, CSysVector<su2double>&);
+template void  CSysMatrix<passivedouble>::MatrixMatrixAddition(passivedouble, const CSysMatrix<passivedouble>&);
+template void  CSysMatrix<passivedouble>::MatrixMatrixAddition(su2double, const CSysMatrix<su2double>&);
 #endif
diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp
index f1fbbc9a4803..b61135586ba3 100644
--- a/Common/src/linear_algebra/CSysSolve.cpp
+++ b/Common/src/linear_algebra/CSysSolve.cpp
@@ -6,7 +6,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
@@ -27,18 +27,29 @@
 
 #include "../../include/linear_algebra/CSysSolve.hpp"
 #include "../../include/linear_algebra/CSysSolve_b.hpp"
+#include "../../include/omp_structure.hpp"
+#include "../../include/option_structure.hpp"
+#include "../../include/config_structure.hpp"
+#include "../../include/geometry/CGeometry.hpp"
+#include "../../include/linear_algebra/CSysMatrix.hpp"
+#include "../../include/linear_algebra/CMatrixVectorProduct.hpp"
+#include "../../include/linear_algebra/CPreconditioner.hpp"
+
+#include <limits>
+
+const su2double eps = numeric_limits<passivedouble>::epsilon(); /*!< \brief machine epsilon */
 
 template<class ScalarType>
 CSysSolve<ScalarType>::CSysSolve(const bool mesh_deform_mode) : cg_ready(false), bcg_ready(false),
                                                                 gmres_ready(false), smooth_ready(false) {
   mesh_deform = mesh_deform_mode;
-  LinSysRes_ptr = NULL;
-  LinSysSol_ptr = NULL;
+  LinSysRes_ptr = nullptr;
+  LinSysSol_ptr = nullptr;
   Residual = 0.0;
 }
 
 template<class ScalarType>
-void CSysSolve<ScalarType>::ApplyGivens(const ScalarType & s, const ScalarType & c, ScalarType & h1, ScalarType & h2) {
+void CSysSolve<ScalarType>::ApplyGivens(ScalarType s, ScalarType c, ScalarType & h1, ScalarType & h2) const {
 
   ScalarType temp = c*h1 + s*h2;
   h2 = c*h2 - s*h1;
@@ -46,7 +57,7 @@ void CSysSolve<ScalarType>::ApplyGivens(const ScalarType & s, const ScalarType &
 }
 
 template<class ScalarType>
-void CSysSolve<ScalarType>::GenerateGivens(ScalarType & dx, ScalarType & dy, ScalarType & s, ScalarType & c) {
+void CSysSolve<ScalarType>::GenerateGivens(ScalarType & dx, ScalarType & dy, ScalarType & s, ScalarType & c) const {
 
   if ( (dx == 0.0) && (dy == 0.0) ) {
     c = 1.0;
@@ -76,8 +87,8 @@ void CSysSolve<ScalarType>::GenerateGivens(ScalarType & dx, ScalarType & dy, Sca
 }
 
 template<class ScalarType>
-void CSysSolve<ScalarType>::SolveReduced(const int & n, const vector<vector<ScalarType> > & Hsbg,
-                             const vector<ScalarType> & rhs, vector<ScalarType> & x) {
+void CSysSolve<ScalarType>::SolveReduced(int n, const vector<vector<ScalarType> > & Hsbg,
+                                         const vector<ScalarType> & rhs, vector<ScalarType> & x) const {
   // initialize...
   for (int i = 0; i < n; i++)
     x[i] = rhs[i];
@@ -91,73 +102,38 @@ void CSysSolve<ScalarType>::SolveReduced(const int & n, const vector<vector<Scal
 }
 
 template<class ScalarType>
-void CSysSolve<ScalarType>::ModGramSchmidt(int i, vector<vector<ScalarType> > & Hsbg, vector<CSysVector<ScalarType> > & w) {
-
-  bool Convergence = true;
+void CSysSolve<ScalarType>::ModGramSchmidt(int i, vector<vector<ScalarType> > & Hsbg,
+                                           vector<CSysVector<ScalarType> > & w) const {
 
   /*--- Parameter for reorthonormalization ---*/
 
-  static const ScalarType reorth = 0.98;
+  const ScalarType reorth = 0.98;
 
   /*--- Get the norm of the vector being orthogonalized, and find the
   threshold for re-orthogonalization ---*/
 
-  ScalarType nrm = dotProd(w[i+1], w[i+1]);
+  ScalarType nrm = w[i+1].squaredNorm();
   ScalarType thr = nrm*reorth;
 
   /*--- The norm of w[i+1] < 0.0 or w[i+1] = NaN ---*/
 
-  if ((nrm <= 0.0) || (nrm != nrm)) Convergence = false;
-
-  /*--- Synchronization point to check the convergence of the solver ---*/
-
-#ifdef HAVE_MPI
-
-  int rank = SU2_MPI::GetRank();
-  int size = SU2_MPI::GetSize();
-
-  unsigned short *sbuf_conv = NULL, *rbuf_conv = NULL;
-  sbuf_conv = new unsigned short[1]; sbuf_conv[0] = 0;
-  rbuf_conv = new unsigned short[1]; rbuf_conv[0] = 0;
-
-  /*--- Convergence criteria ---*/
-
-  sbuf_conv[0] = Convergence;
-  SU2_MPI::Reduce(sbuf_conv, rbuf_conv, 1, MPI_UNSIGNED_SHORT, MPI_SUM, MASTER_NODE, MPI_COMM_WORLD);
-
-  /*-- Compute global convergence criteria in the master node --*/
-
-  sbuf_conv[0] = 0;
-  if (rank == MASTER_NODE) {
-    if (rbuf_conv[0] == size) sbuf_conv[0] = 1;
-    else sbuf_conv[0] = 0;
-  }
-
-  SU2_MPI::Bcast(sbuf_conv, 1, MPI_UNSIGNED_SHORT, MASTER_NODE, MPI_COMM_WORLD);
-
-  if (sbuf_conv[0] == 1) Convergence = true;
-  else Convergence = false;
-
-  delete [] sbuf_conv;
-  delete [] rbuf_conv;
-
-#endif
-
-  if (!Convergence) {
-    SU2_MPI::Error("SU2 has diverged.", CURRENT_FUNCTION);
+  if ((nrm <= 0.0) || (nrm != nrm)) {
+    /*--- nrm is the result of a dot product, communications are implicitly handled. ---*/
+    SU2_OMP_MASTER
+    SU2_MPI::Error("FGMRES orthogonalization failed, linear solver diverged.", CURRENT_FUNCTION);
   }
 
   /*--- Begin main Gram-Schmidt loop ---*/
 
   for (int k = 0; k < i+1; k++) {
-    ScalarType prod = dotProd(w[i+1], w[k]);
+    ScalarType prod = w[i+1].dot(w[k]);
     Hsbg[k][i] = prod;
     w[i+1].Plus_AX(-prod, w[k]);
 
     /*--- Check if reorthogonalization is necessary ---*/
 
     if (prod*prod > thr) {
-      prod = dotProd(w[i+1], w[k]);
+      prod = w[i+1].dot(w[k]);
       Hsbg[k][i] += prod;
       w[i+1].Plus_AX(-prod, w[k]);
     }
@@ -181,44 +157,69 @@ void CSysSolve<ScalarType>::ModGramSchmidt(int i, vector<vector<ScalarType> > &
 }
 
 template<class ScalarType>
-void CSysSolve<ScalarType>::WriteHeader(const string & solver, const ScalarType & restol, const ScalarType & resinit) {
+void CSysSolve<ScalarType>::WriteHeader(string solver, ScalarType restol, ScalarType resinit) const {
 
-  cout << "\n# " << solver << " residual history" << endl;
-  cout << "# Residual tolerance target = " << restol << endl;
+  cout << "\n# " << solver << " residual history\n";
+  cout << "# Residual tolerance target = " << restol << "\n";
   cout << "# Initial residual norm     = " << resinit << endl;
+}
+
+template<class ScalarType>
+void CSysSolve<ScalarType>::WriteHistory(unsigned long iter, ScalarType res) const {
 
+  cout << "     " << iter << "     " << res << endl;
 }
 
 template<class ScalarType>
-void CSysSolve<ScalarType>::WriteHistory(const int & iter, const ScalarType & res, const ScalarType & resinit) {
+void CSysSolve<ScalarType>::WriteFinalResidual(string solver, unsigned long iter, ScalarType res) const {
 
-  cout << "     " << iter << "     " << res/resinit << endl;
+  cout << "# " << solver << " final (true) residual:\n";
+  cout << "# Iteration = " << iter << ": |res|/|res0| = " << res << ".\n" << endl;
+}
+
+template<class ScalarType>
+void CSysSolve<ScalarType>::WriteWarning(ScalarType res_calc, ScalarType res_true, ScalarType tol) const {
 
+  cout << "# WARNING:\n";
+  cout << "# true residual norm and calculated residual norm do not agree.\n";
+  cout << "# true_res = " << res_true << ", calc_res = " << res_calc << ", tol = " << tol*10 << ".\n";
+  cout << "# true_res - calc_res = " << res_true - res_calc << endl;
 }
 
 template<class ScalarType>
 unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> & b, CSysVector<ScalarType> & x,
-                                                  CMatrixVectorProduct<ScalarType> & mat_vec, CPreconditioner<ScalarType> & precond,
-                                                  ScalarType tol, unsigned long m, ScalarType *residual, bool monitoring, CConfig *config) {
+                                                  const CMatrixVectorProduct<ScalarType> & mat_vec, const CPreconditioner<ScalarType> & precond,
+                                                  ScalarType tol, unsigned long m, ScalarType & residual, bool monitoring, CConfig *config) const {
 
-  int rank = SU2_MPI::GetRank();
+  const bool master = (SU2_MPI::GetRank() == MASTER_NODE) && (omp_get_thread_num() == 0);
   ScalarType norm_r = 0.0, norm0 = 0.0;
-  int i = 0;
+  unsigned long i = 0;
 
   /*--- Check the subspace size ---*/
 
   if (m < 1) {
-    char buf[100];
-    SPRINTF(buf, "Illegal value for subspace size, m = %lu", m );
-    SU2_MPI::Error(string(buf), CURRENT_FUNCTION);
+    SU2_OMP_MASTER
+    SU2_MPI::Error("Number of linear solver iterations must be greater than 0.", CURRENT_FUNCTION);
   }
 
-  /*--- Allocate if not allocated yet ---*/
+  /*--- Allocate if not allocated yet, only one thread can
+   *    do this since the working vectors are shared. ---*/
 
   if (!cg_ready) {
-    A_x = b;
-    z = b;
-    cg_ready = true;
+    SU2_OMP_MASTER
+    {
+      auto nVar = b.GetNVar();
+      auto nBlk = b.GetNBlk();
+      auto nBlkDomain = b.GetNBlkDomain();
+
+      A_x.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      r.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      z.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      p.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+
+      cg_ready = true;
+    }
+    SU2_OMP_BARRIER
   }
 
   /*--- Calculate the initial residual, compute norm, and check if system is already solved ---*/
@@ -233,7 +234,7 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
     norm_r = r.norm();
     norm0  = b.norm();
     if ((norm_r < tol*norm0) || (norm_r < eps)) {
-      if (rank == MASTER_NODE) cout << "CSysSolve::ConjugateGradient(): system solved by initial guess." << endl;
+      if (master) cout << "CSysSolve::ConjugateGradient(): system solved by initial guess." << endl;
       return 0;
     }
 
@@ -243,20 +244,21 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
 
     /*--- Output header information including initial residual ---*/
 
-    if ((monitoring) && (rank == MASTER_NODE)) {
+    if ((monitoring) && (master)) {
       WriteHeader("CG", tol, norm_r);
-      WriteHistory(i, norm_r, norm0);
+      WriteHistory(i, norm_r/norm0);
     }
 
   }
 
-  ScalarType alpha, beta, r_dot_z;
+  ScalarType alpha, beta, r_dot_z, r_dot_z_old;
   precond(r, z);
   p = z;
+  r_dot_z = r.dot(z);
 
   /*---  Loop over all search directions ---*/
 
-  for (i = 0; i < (int)m; i++) {
+  for (i = 0; i < m; i++) {
 
     /*--- Apply matrix to p to build Krylov subspace ---*/
 
@@ -264,9 +266,7 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
 
     /*--- Calculate step-length alpha ---*/
 
-    r_dot_z = dotProd(r, z);
-    alpha = dotProd(A_x, p);
-    alpha = r_dot_z / alpha;
+    alpha = r_dot_z / A_x.dot(p);
 
     /*--- Update solution and residual: ---*/
 
@@ -281,7 +281,8 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
 
       norm_r = r.norm();
       if (norm_r < tol*norm0) break;
-      if (((monitoring) && (rank == MASTER_NODE)) && ((i+1) % 10 == 0)) WriteHistory(i+1, norm_r, norm0);
+      if (((monitoring) && (master)) && ((i+1) % 10 == 0))
+        WriteHistory(i+1, norm_r/norm0);
 
     }
 
@@ -290,9 +291,9 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
     /*--- Calculate Gram-Schmidt coefficient beta,
      beta = dotProd(r_{i+1}, z_{i+1}) / dotProd(r_{i}, z_{i}) ---*/
 
-    beta = 1.0 / r_dot_z;
-    r_dot_z = dotProd(r, z);
-    beta *= r_dot_z;
+    r_dot_z_old = r_dot_z;
+    r_dot_z = r.dot(z);
+    beta = r_dot_z / r_dot_z_old;
 
     /*--- Gram-Schmidt orthogonalization; p = beta *p + z ---*/
 
@@ -304,52 +305,42 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
 
   if ((monitoring) && (config->GetComm_Level() == COMM_FULL)) {
 
-    if (rank == MASTER_NODE) {
-      cout << "# Conjugate Gradient final (true) residual:" << endl;
-      cout << "# Iteration = " << i << ": |res|/|res0| = "  << norm_r/norm0 << ".\n" << endl;
-    }
+    if (master) WriteFinalResidual("CG", i, norm_r/norm0);
 
     mat_vec(x, A_x);
     r = b; r -= A_x;
     ScalarType true_res = r.norm();
 
     if (fabs(true_res - norm_r) > tol*10.0) {
-      if (rank == MASTER_NODE) {
-        cout << "# WARNING in CSysSolve::CG_LinSolver(): " << endl;
-        cout << "# true residual norm and calculated residual norm do not agree." << endl;
-        cout << "# true_res = " << true_res <<", calc_res = " << norm_r <<", tol = " << tol*10 <<"."<< endl;
-        cout << "# true_res - calc_res = " << true_res - norm_r << endl;
+      if (master) {
+        WriteWarning(norm_r, true_res, tol);
       }
     }
 
   }
 
-  (*residual) = norm_r/norm0;
-  return (unsigned long) i;
+  residual = norm_r/norm0;
+  return i;
 
 }
 
 template<class ScalarType>
 unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarType> & b, CSysVector<ScalarType> & x,
-                                                      CMatrixVectorProduct<ScalarType> & mat_vec, CPreconditioner<ScalarType> & precond,
-                                                      ScalarType tol, unsigned long m, ScalarType *residual, bool monitoring, CConfig *config) {
+                                                      const CMatrixVectorProduct<ScalarType> & mat_vec, const CPreconditioner<ScalarType> & precond,
+                                                      ScalarType tol, unsigned long m, ScalarType & residual, bool monitoring, CConfig *config) const {
 
-  int rank = SU2_MPI::GetRank();
+  const bool master = (SU2_MPI::GetRank() == MASTER_NODE) && (omp_get_thread_num() == 0);
 
   /*---  Check the subspace size ---*/
 
   if (m < 1) {
-    char buf[100];
-    SPRINTF(buf, "Illegal value for subspace size, m = %lu", m );
-    SU2_MPI::Error(string(buf), CURRENT_FUNCTION);
+    SU2_OMP_MASTER
+    SU2_MPI::Error("Number of linear solver iterations must be greater than 0.", CURRENT_FUNCTION);
   }
 
-  /*---  Check the subspace size ---*/
-
   if (m > 5000) {
-    char buf[100];
-    SPRINTF(buf, "Illegal value for subspace size (too high), m = %lu", m );
-    SU2_MPI::Error(string(buf), CURRENT_FUNCTION);
+    SU2_OMP_MASTER
+    SU2_MPI::Error("FGMRES subspace is too large.", CURRENT_FUNCTION);
   }
 
   /*--- Allocate if not allocated yet
@@ -357,12 +348,18 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
    a temporary CSysVector object for the copy constructor ---*/
 
   if (!gmres_ready) {
-    W.resize(m+1, x);
-    Z.resize(m+1, x);
-    gmres_ready = true;
+    SU2_OMP_MASTER
+    {
+      W.resize(m+1, x);
+      Z.resize(m+1, x);
+      gmres_ready = true;
+    }
+    SU2_OMP_BARRIER
   }
 
-  /*---  Define various arrays ---*/
+  /*--- Define various arrays. In parallel, each thread of each rank has and works
+   on its own copy; since calculations on these arrays are based on dot products
+   (reduced across all threads and ranks), all threads do the same computations. ---*/
 
   vector<ScalarType> g(m+1, 0.0);
   vector<ScalarType> sn(m+1, 0.0);
@@ -370,11 +367,11 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
   vector<ScalarType> y(m, 0.0);
   vector<vector<ScalarType> > H(m+1, vector<ScalarType>(m, 0.0));
 
-  /*---  Calculate the norm of the rhs vector ---*/
+  /*--- Calculate the norm of the rhs vector. ---*/
 
   ScalarType norm0 = b.norm();
 
-  /*---  Calculate the initial residual (actually the negative residual) and compute its norm ---*/
+  /*--- Calculate the initial residual (actually the negative residual) and compute its norm. ---*/
 
   mat_vec(x, W[0]);
   W[0] -= b;
@@ -383,19 +380,19 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
 
   if ((beta < tol*norm0) || (beta < eps)) {
 
-    /*---  System is already solved ---*/
+    /*--- System is already solved ---*/
 
-    if (rank == MASTER_NODE) cout << "CSysSolve::FGMRES(): system solved by initial guess." << endl;
-    (*residual) = beta;
+    if (master) cout << "CSysSolve::FGMRES(): system solved by initial guess." << endl;
+    residual = beta;
     return 0;
   }
 
-  /*---  Normalize residual to get w_{0} (the negative sign is because w[0]
-   holds the negative residual, as mentioned above) ---*/
+  /*--- Normalize residual to get w_{0} (the negative sign is because w[0]
+        holds the negative residual, as mentioned above). ---*/
 
   W[0] /= -beta;
 
-  /*---  Initialize the RHS of the reduced system ---*/
+  /*--- Initialize the RHS of the reduced system ---*/
 
   g[0] = beta;
 
@@ -403,17 +400,17 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
 
   norm0 = beta;
 
-  /*---  Output header information including initial residual ---*/
+  /*--- Output header information including initial residual ---*/
 
-  int i = 0;
-  if ((monitoring) && (rank == MASTER_NODE)) {
+  unsigned long i = 0;
+  if ((monitoring) && (master)) {
     WriteHeader("FGMRES", tol, beta);
-    WriteHistory(i, beta, norm0);
+    WriteHistory(i, beta/norm0);
   }
 
   /*---  Loop over all search directions ---*/
 
-  for (i = 0; i < (int)m; i++) {
+  for (i = 0; i < m; i++) {
 
     /*---  Check if solution has converged ---*/
 
@@ -434,7 +431,7 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
     /*---  Apply old Givens rotations to new column of the Hessenberg matrix then generate the
      new Givens rotation matrix and apply it to the last two elements of H[:][i] and g ---*/
 
-    for (int k = 0; k < i; k++)
+    for (unsigned long k = 0; k < i; k++)
       ApplyGivens(sn[k], cs[k], H[k][i], H[k+1][i]);
     GenerateGivens(H[i][i], H[i+1][i], sn[i], cs[i]);
     ApplyGivens(sn[i], cs[i], g[i], g[i+1]);
@@ -445,14 +442,14 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
 
     /*---  Output the relative residual if necessary ---*/
 
-    if ((((monitoring) && (rank == MASTER_NODE)) && ((i+1) % 10 == 0)) && (rank == MASTER_NODE)) WriteHistory(i+1, beta, norm0);
-
+    if ((((monitoring) && (master)) && ((i+1) % 10 == 0)) && (master))
+      WriteHistory(i+1, beta/norm0);
   }
 
   /*---  Solve the least-squares system and update solution ---*/
 
   SolveReduced(i, H, g, y);
-  for (int k = 0; k < i; k++) {
+  for (unsigned long k = 0; k < i; k++) {
     x.Plus_AX(y[k], Z[k]);
   }
 
@@ -460,56 +457,60 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
 
   if ((monitoring) && (config->GetComm_Level() == COMM_FULL)) {
 
-    if (rank == MASTER_NODE) {
-      cout << "# FGMRES final (true) residual:" << endl;
-      cout << "# Iteration = " << i << ": |res|/|res0| = " << beta/norm0 << ".\n" << endl;
-    }
+    if (master) WriteFinalResidual("FGMRES", i, beta/norm0);
 
     mat_vec(x, W[0]);
     W[0] -= b;
     ScalarType res = W[0].norm();
 
     if (fabs(res - beta) > tol*10) {
-      if (rank == MASTER_NODE) {
-        cout << "# WARNING in CSysSolve::FGMRES_LinSolver(): " << endl;
-        cout << "# true residual norm and calculated residual norm do not agree." << endl;
-        cout << "# res = " << res <<", beta = " << beta <<", tol = " << tol*10 <<"."<< endl;
-        cout << "# res - beta = " << res - beta << endl << endl;
+      if (master) {
+        WriteWarning(beta, res, tol);
       }
     }
 
   }
 
-  (*residual) = beta/norm0;
-  return (unsigned long) i;
+  residual = beta/norm0;
+  return i;
 
 }
 
 template<class ScalarType>
 unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarType> & b, CSysVector<ScalarType> & x,
-                                                       CMatrixVectorProduct<ScalarType> & mat_vec, CPreconditioner<ScalarType> & precond,
-                                                       ScalarType tol, unsigned long m, ScalarType *residual, bool monitoring, CConfig *config) {
+                                                       const CMatrixVectorProduct<ScalarType> & mat_vec, const CPreconditioner<ScalarType> & precond,
+                                                       ScalarType tol, unsigned long m, ScalarType & residual, bool monitoring, CConfig *config) const {
 
-  int rank = SU2_MPI::GetRank();
+  const bool master = (SU2_MPI::GetRank() == MASTER_NODE) && (omp_get_thread_num() == 0);
   ScalarType norm_r = 0.0, norm0 = 0.0;
-  int i = 0;
+  unsigned long i = 0;
 
   /*--- Check the subspace size ---*/
 
   if (m < 1) {
-    char buf[100];
-    SPRINTF(buf, "Illegal value for subspace size, m = %lu", m );
-    SU2_MPI::Error(string(buf), CURRENT_FUNCTION);
+    SU2_OMP_MASTER
+    SU2_MPI::Error("Number of linear solver iterations must be greater than 0.", CURRENT_FUNCTION);
   }
 
   /*--- Allocate if not allocated yet ---*/
 
   if (!bcg_ready) {
-    A_x = b;
-    p = b;
-    z = b;
-    v = b;
-    bcg_ready = true;
+    SU2_OMP_MASTER
+    {
+      auto nVar = b.GetNVar();
+      auto nBlk = b.GetNBlk();
+      auto nBlkDomain = b.GetNBlkDomain();
+
+      A_x.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      r_0.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      r.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      p.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      v.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      z.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+
+      bcg_ready = true;
+    }
+    SU2_OMP_BARRIER
   }
 
   /*--- Calculate the initial residual, compute norm, and check if system is already solved ---*/
@@ -524,7 +525,7 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
     norm_r = r.norm();
     norm0  = b.norm();
     if ((norm_r < tol*norm0) || (norm_r < eps)) {
-      if (rank == MASTER_NODE) cout << "CSysSolve::BCGSTAB(): system solved by initial guess." << endl;
+      if (master) cout << "CSysSolve::BCGSTAB(): system solved by initial guess." << endl;
       return 0;
     }
 
@@ -534,9 +535,9 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
 
     /*--- Output header information including initial residual ---*/
 
-    if ((monitoring) && (rank == MASTER_NODE)) {
+    if ((monitoring) && (master)) {
       WriteHeader("BCGSTAB", tol, norm_r);
-      WriteHistory(i, norm_r, norm0);
+      WriteHistory(i, norm_r/norm0);
     }
 
   }
@@ -546,9 +547,9 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
   ScalarType alpha = 1.0, beta = 1.0, omega = 1.0, rho = 1.0, rho_prime = 1.0;
   p = ScalarType(0.0); v = ScalarType(0.0); r_0 = r;
 
-  /*---  Loop over all search directions ---*/
+  /*--- Loop over all search directions ---*/
 
-  for (i = 0; i < (int)m; i++) {
+  for (i = 0; i < m; i++) {
 
     /*--- Compute rho_prime ---*/
 
@@ -556,7 +557,7 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
 
     /*--- Compute rho_i ---*/
 
-    rho = dotProd(r, r_0);
+    rho = r.dot(r_0);
 
     /*--- Compute beta ---*/
 
@@ -566,7 +567,7 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
 
     ScalarType beta_omega = -beta*omega;
     p.Equals_AX_Plus_BY(beta, p, beta_omega, v);
-    p.Plus_AX(1.0, r);
+    p += r;
 
     /*--- Preconditioning step ---*/
 
@@ -575,7 +576,7 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
 
     /*--- Calculate step-length alpha ---*/
 
-    ScalarType r_0_v = dotProd(r_0, v);
+    ScalarType r_0_v = r_0.dot(v);
     alpha = rho / r_0_v;
 
     /*--- Update solution and residual: ---*/
@@ -592,7 +593,7 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
 
     /*--- Calculate step-length omega ---*/
 
-    omega = dotProd(A_x, r) / dotProd(A_x, A_x);
+    omega = A_x.dot(r) / A_x.squaredNorm();
 
     /*--- Update solution and residual: ---*/
 
@@ -609,7 +610,8 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
 
       norm_r = r.norm();
       if (norm_r < tol*norm0) break;
-      if (((monitoring) && (rank == MASTER_NODE)) && ((i+1) % 10 == 0) && (rank == MASTER_NODE)) WriteHistory(i+1, norm_r, norm0);
+      if (((monitoring) && (master)) && ((i+1) % 10 == 0) && (master))
+        WriteHistory(i+1, norm_r/norm0);
 
     }
 
@@ -619,34 +621,28 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
 
   if ((monitoring) && (config->GetComm_Level() == COMM_FULL)) {
 
-    if (rank == MASTER_NODE) {
-      cout << "# BCGSTAB final (true) residual:" << endl;
-      cout << "# Iteration = " << i << ": |res|/|res0| = "  << norm_r/norm0 << ".\n" << endl;
-    }
+    if (master) WriteFinalResidual("BCGSTAB", i, norm_r/norm0);
 
     mat_vec(x, A_x);
     r = b; r -= A_x;
     ScalarType true_res = r.norm();
 
-    if ((fabs(true_res - norm_r) > tol*10.0) && (rank == MASTER_NODE)) {
-      cout << "# WARNING in CSysSolve::BCGSTAB_LinSolver(): " << endl;
-      cout << "# true residual norm and calculated residual norm do not agree." << endl;
-      cout << "# true_res = " << true_res <<", calc_res = " << norm_r <<", tol = " << tol*10 <<"."<< endl;
-      cout << "# true_res - calc_res = " << true_res <<" "<< norm_r << endl;
+    if ((fabs(true_res - norm_r) > tol*10.0) && (master)) {
+      WriteWarning(norm_r, true_res, tol);
     }
 
   }
 
-  (*residual) = norm_r/norm0;
-  return (unsigned long) i;
+  residual = norm_r/norm0;
+  return i;
 }
 
 template<class ScalarType>
 unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarType> & b, CSysVector<ScalarType> & x,
-                                                        CMatrixVectorProduct<ScalarType> & mat_vec, CPreconditioner<ScalarType> & precond,
-                                                        ScalarType tol, unsigned long m, ScalarType *residual, bool monitoring, CConfig *config) {
+                                                        const CMatrixVectorProduct<ScalarType> & mat_vec, const CPreconditioner<ScalarType> & precond,
+                                                        ScalarType tol, unsigned long m, ScalarType & residual, bool monitoring, CConfig *config) const {
 
-  int rank = SU2_MPI::GetRank();
+  const bool master = (SU2_MPI::GetRank() == MASTER_NODE) && (omp_get_thread_num() == 0);
   ScalarType norm_r = 0.0, norm0 = 0.0;
   unsigned long i = 0;
 
@@ -654,18 +650,27 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
   ScalarType omega = SU2_TYPE::GetValue(config->GetLinear_Solver_Smoother_Relaxation());
 
   if (m < 1) {
-    char buf[100];
-    SPRINTF(buf, "Illegal value for smoothing iterations, m = %lu", m );
-    SU2_MPI::Error(string(buf), CURRENT_FUNCTION);
+    SU2_OMP_MASTER
+    SU2_MPI::Error("Number of linear solver iterations must be greater than 0.", CURRENT_FUNCTION);
   }
 
   /*--- Allocate vectors for residual (r), solution increment (z), and matrix-vector
    product (A_x), for the latter two this is done only on the first call to the method. ---*/
 
   if (!smooth_ready) {
-    z = b;
-    A_x = b;
-    smooth_ready = true;
+    SU2_OMP_MASTER
+    {
+      auto nVar = b.GetNVar();
+      auto nBlk = b.GetNBlk();
+      auto nBlkDomain = b.GetNBlkDomain();
+
+      A_x.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      r.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+      z.Initialize(nBlk, nBlkDomain, nVar, nullptr);
+
+      smooth_ready = true;
+    }
+    SU2_OMP_BARRIER
   }
 
   /*--- Compute the initial residual and check if the system is already solved (if in COMM_FULL mode). ---*/
@@ -680,7 +685,7 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
     norm_r = r.norm();
     norm0  = b.norm();
     if ( (norm_r < tol*norm0) || (norm_r < eps) ) {
-      if (rank == MASTER_NODE) cout << "CSysSolve::Smoother_LinSolver(): system solved by initial guess." << endl;
+      if (master) cout << "CSysSolve::Smoother_LinSolver(): system solved by initial guess." << endl;
       return 0;
     }
 
@@ -690,11 +695,9 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
 
     /*--- Output header information including initial residual. ---*/
 
-    if ((monitoring) && (rank == MASTER_NODE)) {
-      cout << "\n# " << "Smoother" << " residual history" << endl;
-      cout << "# Residual tolerance target = " << tol << endl;
-      cout << "# Initial residual norm     = " << norm_r << endl;
-      cout << "     " << i << "     " << norm_r/norm0 << endl;
+    if ((monitoring) && (master)) {
+      WriteHeader("Smoother", tol, norm_r);
+      WriteHistory(i, norm_r/norm0);
     }
 
   }
@@ -730,22 +733,21 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
     if (config->GetComm_Level() == COMM_FULL) {
       norm_r = r.norm();
       if (norm_r < tol*norm0) break;
-      if (((monitoring) && (rank == MASTER_NODE)) && ((i+1) % 5 == 0))
-        cout << "     " << i << "     " << norm_r/norm0 << endl;
+      if (((monitoring) && (master)) && ((i+1) % 5 == 0))
+        WriteHistory(i+1, norm_r/norm0);
     }
   }
 
-  if ((monitoring) && (rank == MASTER_NODE) && (config->GetComm_Level() == COMM_FULL)) {
-    cout << "# Smoother final (true) residual:" << endl;
-    cout << "# Iteration = " << i << ": |res|/|res0| = "  << norm_r/norm0 << ".\n" << endl;
+  if ((monitoring) && (master) && (config->GetComm_Level() == COMM_FULL)) {
+    WriteFinalResidual("Smoother", i, norm_r/norm0);
   }
 
-  (*residual) = norm_r/norm0;
+  residual = norm_r/norm0;
   return i;
 }
 
 template<>
-void CSysSolve<su2double>::HandleTemporariesIn(CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol) {
+void CSysSolve<su2double>::HandleTemporariesIn(const CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol) {
 
   /*--- When the type is the same the temporaties are not required ---*/
   /*--- Set the pointers ---*/
@@ -758,13 +760,13 @@ void CSysSolve<su2double>::HandleTemporariesOut(CSysVector<su2double> & LinSysSo
 
   /*--- When the type is the same the temporaties are not required ---*/
   /*--- Reset the pointers ---*/
-  LinSysRes_ptr = NULL;
-  LinSysSol_ptr = NULL;
+  LinSysRes_ptr = nullptr;
+  LinSysSol_ptr = nullptr;
 }
 
 #ifdef CODI_REVERSE_TYPE
 template<>
-void CSysSolve<passivedouble>::HandleTemporariesIn(CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol) {
+void CSysSolve<passivedouble>::HandleTemporariesIn(const CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol) {
 
   /*--- When the type is different we need to copy data to the temporaries ---*/
   /*--- Copy data, the solution is also copied because it serves as initial conditions ---*/
@@ -784,13 +786,13 @@ void CSysSolve<passivedouble>::HandleTemporariesOut(CSysVector<su2double> & LinS
   LinSysSol.PassiveCopy(LinSysSol_tmp);
 
   /*--- Reset the pointers ---*/
-  LinSysRes_ptr = NULL;
-  LinSysSol_ptr = NULL;
+  LinSysRes_ptr = nullptr;
+  LinSysSol_ptr = nullptr;
 }
 #endif
 
 template<class ScalarType>
-unsigned long CSysSolve<ScalarType>::Solve(CSysMatrix<ScalarType> & Jacobian, CSysVector<su2double> & LinSysRes,
+unsigned long CSysSolve<ScalarType>::Solve(CSysMatrix<ScalarType> & Jacobian, const CSysVector<su2double> & LinSysRes,
                                            CSysVector<su2double> & LinSysSol, CGeometry *geometry, CConfig *config) {
   /*---
    A word about the templated types. It is assumed that the residual and solution vectors are always of su2doubles,
@@ -803,8 +805,8 @@ unsigned long CSysSolve<ScalarType>::Solve(CSysMatrix<ScalarType> & Jacobian, CS
   ---*/
 
   unsigned short KindSolver, KindPrecond;
-  unsigned long MaxIter, RestartIter, IterLinSol = 0;
-  ScalarType SolverTol, Norm0 = 0.0;
+  unsigned long MaxIter, RestartIter;
+  ScalarType SolverTol;
   bool ScreenOutput;
 
   /*--- Normal mode ---*/
@@ -852,68 +854,88 @@ unsigned long CSysSolve<ScalarType>::Solve(CSysMatrix<ScalarType> & Jacobian, CS
 
   HandleTemporariesIn(LinSysRes, LinSysSol);
 
-  CMatrixVectorProduct<ScalarType>* mat_vec = new CSysMatrixVectorProduct<ScalarType>(Jacobian, geometry, config);
-  CPreconditioner<ScalarType>* precond = NULL;
+  auto mat_vec = CSysMatrixVectorProduct<ScalarType>(Jacobian, geometry, config);
+  CPreconditioner<ScalarType>* precond = nullptr;
 
   switch (KindPrecond) {
     case JACOBI:
-      Jacobian.BuildJacobiPreconditioner();
-      precond = new CJacobiPreconditioner<ScalarType>(Jacobian, geometry, config);
+      precond = new CJacobiPreconditioner<ScalarType>(Jacobian, geometry, config, false);
       break;
     case ILU:
-      Jacobian.BuildILUPreconditioner();
-      precond = new CILUPreconditioner<ScalarType>(Jacobian, geometry, config);
+      precond = new CILUPreconditioner<ScalarType>(Jacobian, geometry, config, false);
       break;
     case LU_SGS:
       precond = new CLU_SGSPreconditioner<ScalarType>(Jacobian, geometry, config);
       break;
     case LINELET:
-      Jacobian.BuildJacobiPreconditioner();
       precond = new CLineletPreconditioner<ScalarType>(Jacobian, geometry, config);
       break;
     case PASTIX_ILU: case PASTIX_LU_P: case PASTIX_LDLT_P:
-      Jacobian.BuildPastixPreconditioner(geometry, config, KindPrecond);
-      precond = new CPastixPreconditioner<ScalarType>(Jacobian, geometry, config);
+      precond = new CPastixPreconditioner<ScalarType>(Jacobian, geometry, config, KindPrecond, false);
       break;
     default:
-      Jacobian.BuildJacobiPreconditioner();
-      precond = new CJacobiPreconditioner<ScalarType>(Jacobian, geometry, config);
+      precond = new CJacobiPreconditioner<ScalarType>(Jacobian, geometry, config, false);
       break;
   }
 
-  switch (KindSolver) {
-    case BCGSTAB:
-      IterLinSol = BCGSTAB_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, *mat_vec, *precond, SolverTol, MaxIter, &Residual, ScreenOutput, config);
-      break;
-    case FGMRES:
-      IterLinSol = FGMRES_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, *mat_vec, *precond, SolverTol, MaxIter, &Residual, ScreenOutput, config);
-      break;
-    case CONJUGATE_GRADIENT:
-      IterLinSol = CG_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, *mat_vec, *precond, SolverTol, MaxIter, &Residual, ScreenOutput, config);
-      break;
-    case RESTARTED_FGMRES:
-      IterLinSol = 0;
-      Norm0 = LinSysRes_ptr->norm();
-      while (IterLinSol < MaxIter) {
-        /*--- Enforce a hard limit on total number of iterations ---*/
-        unsigned long IterLimit = min(RestartIter, MaxIter-IterLinSol);
-        IterLinSol += FGMRES_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, *mat_vec, *precond, SolverTol, IterLimit, &Residual, ScreenOutput, config);
-        if ( Residual < SolverTol*Norm0 ) break;
-      }
-      break;
-    case SMOOTHER:
-      IterLinSol = Smoother_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, *mat_vec, *precond, SolverTol, MaxIter, &Residual, ScreenOutput, config);
-      break;
-    case PASTIX_LDLT : case PASTIX_LU:
-      Jacobian.BuildPastixPreconditioner(geometry, config, KindSolver);
-      Jacobian.ComputePastixPreconditioner(*LinSysRes_ptr, *LinSysSol_ptr, geometry, config);
-      IterLinSol = 1;
-      break;
-    default:
-      SU2_MPI::Error("Unknown type of linear solver.",CURRENT_FUNCTION);
-  }
+  /*--- Start a thread-parallel section covering the preparation of the
+   *    preconditioner and the solution of the linear system.
+   *    Beware of shared variables, i.e. those defined outside the section
+   *    or members of ANY class used therein; they should be treated as
+   *    read-only, or explicitly synchronized if written to. ---*/
+
+  unsigned long IterLinSol = 0;
+
+  SU2_OMP_PARALLEL
+  {
+    /*--- Build preconditioner in parallel. ---*/
+    precond->Build();
+
+    /*--- Thread-local variables. ---*/
+    unsigned long iter = 0;
+    ScalarType residual = 0.0, norm0 = 0.0;
+
+    switch (KindSolver) {
+      case BCGSTAB:
+        iter = BCGSTAB_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, MaxIter, residual, ScreenOutput, config);
+        break;
+      case FGMRES:
+        iter = FGMRES_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, MaxIter, residual, ScreenOutput, config);
+        break;
+      case CONJUGATE_GRADIENT:
+        iter = CG_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, MaxIter, residual, ScreenOutput, config);
+        break;
+      case RESTARTED_FGMRES:
+        norm0 = LinSysRes_ptr->norm();
+        while (iter < MaxIter) {
+          /*--- Enforce a hard limit on total number of iterations ---*/
+          unsigned long IterLimit = min(RestartIter, MaxIter-iter);
+          iter += FGMRES_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, IterLimit, residual, ScreenOutput, config);
+          if ( residual < SolverTol*norm0 ) break;
+        }
+        break;
+      case SMOOTHER:
+        iter = Smoother_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, MaxIter, residual, ScreenOutput, config);
+        break;
+      case PASTIX_LDLT : case PASTIX_LU:
+        Jacobian.BuildPastixPreconditioner(geometry, config, KindSolver);
+        Jacobian.ComputePastixPreconditioner(*LinSysRes_ptr, *LinSysSol_ptr, geometry, config);
+        iter = 1;
+        break;
+      default:
+        SU2_MPI::Error("Unknown type of linear solver.",CURRENT_FUNCTION);
+    }
+
+    /*--- Only one thread modifies shared variables, synchronization
+     *    is not required as we are exiting the parallel section. ---*/
+    SU2_OMP_MASTER
+    {
+      IterLinSol = iter;
+      Residual = residual;
+    }
+
+  } // end SU2_OMP_PARALLEL
 
-  delete mat_vec;
   delete precond;
 
   HandleTemporariesOut(LinSysSol);
@@ -964,7 +986,7 @@ unsigned long CSysSolve<ScalarType>::Solve(CSysMatrix<ScalarType> & Jacobian, CS
 }
 
 template<class ScalarType>
-unsigned long CSysSolve<ScalarType>::Solve_b(CSysMatrix<ScalarType> & Jacobian, CSysVector<su2double> & LinSysRes,
+unsigned long CSysSolve<ScalarType>::Solve_b(CSysMatrix<ScalarType> & Jacobian, const CSysVector<su2double> & LinSysRes,
                                              CSysVector<su2double> & LinSysSol, CGeometry *geometry, CConfig *config) {
 #ifdef CODI_REVERSE_TYPE
 
@@ -999,21 +1021,21 @@ unsigned long CSysSolve<ScalarType>::Solve_b(CSysMatrix<ScalarType> & Jacobian,
 
   /*--- Set up preconditioner and matrix-vector product ---*/
 
-  CPreconditioner<ScalarType>* precond  = NULL;
+  CPreconditioner<ScalarType>* precond  = nullptr;
 
   switch(KindPrecond) {
     case ILU:
-      precond = new CILUPreconditioner<ScalarType>(Jacobian, geometry, config);
+      precond = new CILUPreconditioner<ScalarType>(Jacobian, geometry, config, RequiresTranspose);
       break;
     case JACOBI:
-      precond = new CJacobiPreconditioner<ScalarType>(Jacobian, geometry, config);
+      precond = new CJacobiPreconditioner<ScalarType>(Jacobian, geometry, config, RequiresTranspose);
       break;
     case PASTIX_ILU: case PASTIX_LU_P: case PASTIX_LDLT_P:
-      precond = new CPastixPreconditioner<ScalarType>(Jacobian, geometry, config);
+      precond = new CPastixPreconditioner<ScalarType>(Jacobian, geometry, config, KindPrecond, RequiresTranspose);
       break;
   }
 
-  CMatrixVectorProduct<ScalarType>* mat_vec = new CSysMatrixVectorProductTransposed<ScalarType>(Jacobian, geometry, config);
+  auto mat_vec = CSysMatrixVectorProductTransposed<ScalarType>(Jacobian, geometry, config);
 
   /*--- Solve the system ---*/
 
@@ -1021,13 +1043,13 @@ unsigned long CSysSolve<ScalarType>::Solve_b(CSysMatrix<ScalarType> & Jacobian,
 
   switch(KindSolver) {
     case FGMRES:
-      IterLinSol = FGMRES_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, *mat_vec, *precond, SolverTol , MaxIter, &Residual, ScreenOutput, config);
+      IterLinSol = FGMRES_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol , MaxIter, Residual, ScreenOutput, config);
       break;
     case BCGSTAB:
-      IterLinSol = BCGSTAB_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, *mat_vec, *precond, SolverTol , MaxIter, &Residual, ScreenOutput, config);
+      IterLinSol = BCGSTAB_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol , MaxIter, Residual, ScreenOutput, config);
       break;
     case CONJUGATE_GRADIENT:
-      IterLinSol = CG_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, *mat_vec, *precond, SolverTol, MaxIter, &Residual, ScreenOutput, config);
+      IterLinSol = CG_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol, MaxIter, Residual, ScreenOutput, config);
       break;
     case RESTARTED_FGMRES:
       IterLinSol = 0;
@@ -1035,7 +1057,7 @@ unsigned long CSysSolve<ScalarType>::Solve_b(CSysMatrix<ScalarType> & Jacobian,
       while (IterLinSol < MaxIter) {
         /*--- Enforce a hard limit on total number of iterations ---*/
         unsigned long IterLimit = min(RestartIter, MaxIter-IterLinSol);
-        IterLinSol += FGMRES_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, *mat_vec, *precond, SolverTol , IterLimit, &Residual, ScreenOutput, config);
+        IterLinSol += FGMRES_LinSolver(*LinSysRes_ptr, *LinSysSol_ptr, mat_vec, *precond, SolverTol , IterLimit, Residual, ScreenOutput, config);
         if ( Residual < SolverTol*Norm0 ) break;
       }
       break;
@@ -1051,7 +1073,6 @@ unsigned long CSysSolve<ScalarType>::Solve_b(CSysMatrix<ScalarType> & Jacobian,
 
   HandleTemporariesOut(LinSysSol);
 
-  delete mat_vec;
   delete precond;
 
   return IterLinSol;
diff --git a/Common/src/linear_algebra/CSysVector.cpp b/Common/src/linear_algebra/CSysVector.cpp
index fb44e57f6b12..86c68e4448af 100644
--- a/Common/src/linear_algebra/CSysVector.cpp
+++ b/Common/src/linear_algebra/CSysVector.cpp
@@ -1,12 +1,12 @@
 /*!
- * \file vector_structure.cpp
+ * \file CSysVector.cpp
  * \brief Main classes required for solving linear systems of equations
  * \author F. Palacios, J. Hicken
  * \version 7.0.0 "Blackbird"
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
@@ -26,472 +26,243 @@
  */
 
 #include "../../include/linear_algebra/CSysVector.hpp"
+#include "../../include/mpi_structure.hpp"
+#include "../../include/omp_structure.hpp"
+#include "../../include/toolboxes/allocation_toolbox.hpp"
+
+/*!
+ * \brief OpenMP worksharing construct used in CSysVector for loops.
+ * \note The loop will only run in parallel if methods are called from a
+ * parallel region (if not, the results will still be correct).
+ * A static schedule is used to reduce overhead; the chunk size is determined at initialization.
+ * The "nowait" clause is safe when calling CSysVector methods one after another,
+ * as the loop size is the same. Methods of other classes that operate on a
+ * CSysVector and do not have the same work scheduling must use a
+ * SU2_OMP_BARRIER before using the vector.
+ */
+#define PARALLEL_FOR SU2_OMP(for schedule(static,omp_chunk_size) nowait)
 
 template<class ScalarType>
 CSysVector<ScalarType>::CSysVector(void) {
 
-  nElm = 0; nElmDomain = 0;
-  nBlk = 0; nBlkDomain = 0;
-  nVar = 0;
-  
-  vec_val = NULL;
+  vec_val = nullptr;
   nElm = 0;
   nElmDomain = 0;
   nVar = 0;
-  nBlk = 0;
-  nBlkDomain = 0;
-
+  omp_chunk_size = OMP_MAX_SIZE;
+  dotRes = 0.0;
 }
 
 template<class ScalarType>
-CSysVector<ScalarType>::CSysVector(const unsigned long & size, const ScalarType & val) {
+void CSysVector<ScalarType>::Initialize(unsigned long numBlk, unsigned long numBlkDomain,
+                                        unsigned long numVar, const ScalarType* val, bool valIsArray) {
 
-  nElm = size; nElmDomain = size;
-  nBlk = nElm; nBlkDomain = nElmDomain;
-  nVar = 1;
+  /*--- Assert that this method is only called by one thread. ---*/
+  assert(omp_get_thread_num()==0 && "Only the master thread is allowed to initialize the vector.");
 
-  /*--- Check for invalid size, then allocate memory and initialize values ---*/
-  if ( (nElm >= ULONG_MAX) ) {
-    char buf[100];
-    SPRINTF(buf, "Invalid input: size = %lu", size );
-    SU2_MPI::Error(string(buf), CURRENT_FUNCTION);
+  if ((nElm != numBlk*numVar) && (vec_val != nullptr)) {
+    MemoryAllocation::aligned_free(vec_val);
+    vec_val = nullptr;
   }
 
-  vec_val = new ScalarType[nElm];
-  for (unsigned int i = 0; i < nElm; i++)
-    vec_val[i] = val;
-
-#ifdef HAVE_MPI
-  unsigned long nElmLocal = (unsigned long)nElm;
-  SU2_MPI::Allreduce(&nElmLocal, &nElmGlobal, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
-#endif
-
-}
-
-template<class ScalarType>
-CSysVector<ScalarType>::CSysVector(const unsigned long & numBlk, const unsigned long & numBlkDomain, const unsigned short & numVar,
-                       const ScalarType & val) {
-
-  nElm = numBlk*numVar; nElmDomain = numBlkDomain*numVar;
-  nBlk = numBlk; nBlkDomain = numBlkDomain;
+  nElm = numBlk*numVar;
+  nElmDomain = numBlkDomain*numVar;
   nVar = numVar;
 
-  /*--- Check for invalid input, then allocate memory and initialize values ---*/
-  if ( nElm >= ULONG_MAX ) {
-    char buf[100];
-    SPRINTF(buf, "invalid input: numBlk, numVar = %lu, %u", numBlk, numVar );
-    SU2_MPI::Error(string(buf), CURRENT_FUNCTION);
-  }
-
-  vec_val = new ScalarType[nElm];
-  for (unsigned int i = 0; i < nElm; i++)
-    vec_val[i] = val;
-
-#ifdef HAVE_MPI
-  unsigned long nElmLocal = (unsigned long)nElm;
-  SU2_MPI::Allreduce(&nElmLocal, &nElmGlobal, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
-#endif
-
-}
-
-template<class ScalarType>
-CSysVector<ScalarType>::CSysVector(const CSysVector<ScalarType> & u) {
-
-  /*--- Copy size information, allocate memory, and initialize values ---*/
-  nElm = u.nElm; nElmDomain = u.nElmDomain;
-  nBlk = u.nBlk; nBlkDomain = u.nBlkDomain;
-  nVar = u.nVar;
-
-  vec_val = new ScalarType[nElm];
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] = u.vec_val[i];
-
-#ifdef HAVE_MPI
-  nElmGlobal = u.nElmGlobal;
-#endif
-
-}
+  omp_chunk_size = computeStaticChunkSize(nElm, omp_get_max_threads(), OMP_MAX_SIZE);
 
-template<class ScalarType>
-CSysVector<ScalarType>::CSysVector(const unsigned long & size, const ScalarType* u_array) {
+  if (vec_val == nullptr)
+    vec_val = MemoryAllocation::aligned_alloc<ScalarType>(64, nElm*sizeof(ScalarType));
 
-  nElm = size; nElmDomain = size;
-  nBlk = nElm; nBlkDomain = nElmDomain;
-  nVar = 1;
-
-  /*--- Check for invalid size, then allocate memory and initialize values ---*/
-  if ( nElm >= ULONG_MAX ) {
-    char buf[100];
-    SPRINTF(buf, "Invalid input: size = %lu", size );
-    SU2_MPI::Error(string(buf), CURRENT_FUNCTION);
+  if(val != nullptr) {
+    if(!valIsArray) {
+      for(auto i=0ul; i<nElm; i++) vec_val[i] = *val;
+    }
+    else {
+      for(auto i=0ul; i<nElm; i++) vec_val[i] = val[i];
+    }
   }
-
-  vec_val = new ScalarType[nElm];
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] = u_array[i];
-
-#ifdef HAVE_MPI
-  unsigned long nElmLocal = (unsigned long)nElm;
-  SU2_MPI::Allreduce(&nElmLocal, &nElmGlobal, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
-#endif
-
 }
 
 template<class ScalarType>
-CSysVector<ScalarType>::CSysVector(const unsigned long & numBlk, const unsigned long & numBlkDomain, const unsigned short & numVar,
-                       const ScalarType* u_array) {
-
-  nElm = numBlk*numVar; nElmDomain = numBlkDomain*numVar;
-  nBlk = numBlk; nBlkDomain = numBlkDomain;
-  nVar = numVar;
+template<class T>
+void CSysVector<ScalarType>::PassiveCopy(const CSysVector<T>& other) {
 
-  /*--- check for invalid input, then allocate memory and initialize values ---*/
-  if ( nElm >= ULONG_MAX ) {
-    char buf[100];
-    SPRINTF(buf, "invalid input: numBlk, numVar = %lu, %u", numBlk, numVar );
-    SU2_MPI::Error(string(buf), CURRENT_FUNCTION);
-  }
+  /*--- This is a method rather than an operator overload so that whoever
+   calls it is aware of the consequence: derivative information is lost. ---*/
 
-  vec_val = new ScalarType[nElm];
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] = u_array[i];
+  /*--- Check for self-assignment, otherwise perform a deep copy. ---*/
+  if ((const void*)this == (const void*)&other) return;
 
-#ifdef HAVE_MPI
-  unsigned long nElmLocal = (unsigned long)nElm;
-  SU2_MPI::Allreduce(&nElmLocal, &nElmGlobal, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
-#endif
+  SU2_OMP_MASTER
+  Initialize(other.GetNBlk(), other.GetNBlkDomain(), other.GetNVar(), nullptr, true);
+  SU2_OMP_BARRIER
 
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElm; i++)
+    vec_val[i] = SU2_TYPE::GetValue(other[i]);
 }
 
 template<class ScalarType>
 CSysVector<ScalarType>::~CSysVector() {
-  delete [] vec_val;
-
-  nElm = 0; nElmDomain = 0;
-  nBlk = 0; nBlkDomain = 0;
-  nVar = 0;
 
+  if (vec_val != nullptr)
+    MemoryAllocation::aligned_free(vec_val);
 }
 
 template<class ScalarType>
-void CSysVector<ScalarType>::Initialize(const unsigned long & numBlk, const unsigned long & numBlkDomain, const unsigned short & numVar, const ScalarType & val) {
-
-  nElm = numBlk*numVar; nElmDomain = numBlkDomain*numVar;
-  nBlk = numBlk; nBlkDomain = numBlkDomain;
-  nVar = numVar;
-
-  /*--- Check for invalid input, then allocate memory and initialize values ---*/
-  if ( nElm >= ULONG_MAX ) {
-    char buf[100];
-    SPRINTF(buf, "invalid input: numBlk, numVar = %lu, %u", numBlk, numVar );
-    SU2_MPI::Error(string(buf), CURRENT_FUNCTION);
-  }
+void CSysVector<ScalarType>::Equals_AX(ScalarType a, const CSysVector<ScalarType> & x) {
 
-  vec_val = new ScalarType[nElm];
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] = val;
-
-#ifdef HAVE_MPI
-  unsigned long nElmLocal = (unsigned long)nElm;
-  SU2_MPI::Allreduce(&nElmLocal, &nElmGlobal, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
-#endif
+  assert(nElm == x.nElm && "Sizes do not match");
 
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElm; i++) vec_val[i] = a * x.vec_val[i];
 }
 
 template<class ScalarType>
-void CSysVector<ScalarType>::Equals_AX(const ScalarType & a, CSysVector<ScalarType> & x) {
-  /*--- check that *this and x are compatible ---*/
-  if (nElm != x.nElm) {
-    cerr << "CSysVector::Equals_AX(): " << "sizes do not match";
-    throw(-1);
-  }
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] = a * x.vec_val[i];
-}
+void CSysVector<ScalarType>::Plus_AX(ScalarType a, const CSysVector<ScalarType> & x) {
 
-template<class ScalarType>
-void CSysVector<ScalarType>::Plus_AX(const ScalarType & a, CSysVector<ScalarType> & x) {
-  /*--- check that *this and x are compatible ---*/
-  if (nElm != x.nElm) {
-    SU2_MPI::Error("Sizes do not match", CURRENT_FUNCTION);
-  }
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] += a * x.vec_val[i];
+  assert(nElm == x.nElm && "Sizes do not match");
+
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElm; i++) vec_val[i] += a * x.vec_val[i];
 }
 
 template<class ScalarType>
-void CSysVector<ScalarType>::Equals_AX_Plus_BY(const ScalarType & a, CSysVector<ScalarType> & x, const ScalarType & b, CSysVector<ScalarType> & y) {
-  /*--- check that *this, x and y are compatible ---*/
-  if ((nElm != x.nElm) || (nElm != y.nElm)) {
-    SU2_MPI::Error("Sizes do not match", CURRENT_FUNCTION);
-  }
-  for (unsigned long i = 0; i < nElm; i++)
+void CSysVector<ScalarType>::Equals_AX_Plus_BY(ScalarType a, const CSysVector<ScalarType> & x,
+                                               ScalarType b, const CSysVector<ScalarType> & y) {
+  assert(nElm == x.nElm && nElm == y.nElm && "Sizes do not match");
+
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElm; i++)
     vec_val[i] = a * x.vec_val[i] + b * y.vec_val[i];
 }
 
 template<class ScalarType>
 CSysVector<ScalarType> & CSysVector<ScalarType>::operator=(const CSysVector<ScalarType> & u) {
 
-  /*--- check if self-assignment, otherwise perform deep copy ---*/
-  if (this == &u) return *this;
-
-  /*--- determine if (re-)allocation is needed ---*/
-  if (nElm != u.nElm && vec_val != NULL) {delete [] vec_val; vec_val = NULL;}
-  if (vec_val == NULL) vec_val = new ScalarType[u.nElm];
-
-  /*--- copy ---*/
-  nElm = u.nElm;
-  nElmDomain = u.nElmDomain;
-  nBlk = u.nBlk;
-  nBlkDomain = u.nBlkDomain;
-  nVar = u.nVar;
-
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] = u.vec_val[i];
+  assert(nElm == u.nElm && "Sizes do not match");
 
-#ifdef HAVE_MPI
-  nElmGlobal = u.nElmGlobal;
-#endif
-
-  return *this;
-}
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElm; i++) vec_val[i] = u.vec_val[i];
 
-template<class ScalarType>
-CSysVector<ScalarType> & CSysVector<ScalarType>::operator=(const ScalarType & val) {
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] = val;
   return *this;
 }
 
 template<class ScalarType>
-CSysVector<ScalarType> CSysVector<ScalarType>::operator+(const CSysVector<ScalarType> & u) const {
+CSysVector<ScalarType> & CSysVector<ScalarType>::operator=(ScalarType val) {
 
-  /*--- Use copy constructor and compound addition-assignment ---*/
-  CSysVector<ScalarType> sum(*this);
-  sum += u;
-  return sum;
-}
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElm; i++) vec_val[i] = val;
 
-template<class ScalarType>
-CSysVector<ScalarType> & CSysVector<ScalarType>::operator+=(const CSysVector<ScalarType> & u) {
-
-  /*--- Check for consistent sizes, then add elements ---*/
-  if (nElm != u.nElm) {
-    SU2_MPI::Error("Sizes do not match", CURRENT_FUNCTION);
-  }
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] += u.vec_val[i];
   return *this;
 }
 
 template<class ScalarType>
-CSysVector<ScalarType> CSysVector<ScalarType>::operator-(const CSysVector<ScalarType> & u) const {
+CSysVector<ScalarType> & CSysVector<ScalarType>::operator+=(const CSysVector<ScalarType> & u) {
 
-  /*--- Use copy constructor and compound subtraction-assignment ---*/
-  CSysVector<ScalarType> diff(*this);
-  diff -= u;
-  return diff;
-}
+  assert(nElm == u.nElm && "Sizes do not match");
 
-template<class ScalarType>
-CSysVector<ScalarType> & CSysVector<ScalarType>::operator-=(const CSysVector<ScalarType> & u) {
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElm; i++) vec_val[i] += u.vec_val[i];
 
-  /*--- Check for consistent sizes, then subtract elements ---*/
-  if (nElm != u.nElm) {
-    SU2_MPI::Error("Sizes do not match", CURRENT_FUNCTION);
-  }
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] -= u.vec_val[i];
   return *this;
 }
 
 template<class ScalarType>
-CSysVector<ScalarType> CSysVector<ScalarType>::operator*(const ScalarType & val) const {
-
-  /*--- use copy constructor and compound scalar
-   multiplication-assignment ---*/
-  CSysVector<ScalarType> prod(*this);
-  prod *= val;
-  return prod;
-}
-
-template<class ScalarType>
-CSysVector<ScalarType> operator*(const ScalarType & val, const CSysVector<ScalarType> & u) {
+CSysVector<ScalarType> & CSysVector<ScalarType>::operator-=(const CSysVector<ScalarType> & u) {
 
-  /*--- use copy constructor and compound scalar
-   multiplication-assignment ---*/
-  CSysVector<ScalarType> prod(u);
-  prod *= val;
-  return prod;
-}
+  assert(nElm == u.nElm && "Sizes do not match");
 
-template<class ScalarType>
-CSysVector<ScalarType> & CSysVector<ScalarType>::operator*=(const ScalarType & val) {
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElm; i++) vec_val[i] -= u.vec_val[i];
 
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] *= val;
   return *this;
 }
 
 template<class ScalarType>
-CSysVector<ScalarType> CSysVector<ScalarType>::operator/(const ScalarType & val) const {
+CSysVector<ScalarType> & CSysVector<ScalarType>::operator*=(ScalarType val) {
 
-  /*--- use copy constructor and compound scalar
-   division-assignment ---*/
-  CSysVector quotient(*this);
-  quotient /= val;
-  return quotient;
-}
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElm; i++) vec_val[i] *= val;
 
-template<class ScalarType>
-CSysVector<ScalarType> & CSysVector<ScalarType>::operator/=(const ScalarType & val) {
-
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] /= val;
   return *this;
 }
 
 template<class ScalarType>
-ScalarType CSysVector<ScalarType>::norm() const {
-
-  /*--- just call dotProd on this*, then sqrt ---*/
-  ScalarType val = dotProd(*this, *this);
-  if (val < 0.0) {
-    SU2_MPI::Error("Inner product of CSysVector is negative", CURRENT_FUNCTION);
-  }
-  return sqrt(val);
-}
-
-template<class ScalarType>
-void CSysVector<ScalarType>::CopyToArray(ScalarType* u_array) {
-
-  for (unsigned long i = 0; i < nElm; i++)
-    u_array[i] = vec_val[i];
-}
-
-template<class ScalarType>
-void CSysVector<ScalarType>::AddBlock(unsigned long val_ipoint, ScalarType *val_residual) {
-  unsigned short iVar;
-
-  for (iVar = 0; iVar < nVar; iVar++)
-    vec_val[val_ipoint*nVar+iVar] += val_residual[iVar];
-}
-
-template<class ScalarType>
-void CSysVector<ScalarType>::SubtractBlock(unsigned long val_ipoint, ScalarType *val_residual) {
-  unsigned short iVar;
-
-  for (iVar = 0; iVar < nVar; iVar++)
-    vec_val[val_ipoint*nVar+iVar] -= val_residual[iVar];
-}
-
-template<class ScalarType>
-void CSysVector<ScalarType>::SetBlock(unsigned long val_ipoint, ScalarType *val_residual) {
-  unsigned short iVar;
-
-  for (iVar = 0; iVar < nVar; iVar++)
-    vec_val[val_ipoint*nVar+iVar] = val_residual[iVar];
-}
-
-template<class ScalarType>
-void CSysVector<ScalarType>::SetBlock(unsigned long val_ipoint, unsigned short val_var, ScalarType val_residual) {
-
-  vec_val[val_ipoint*nVar+val_var] = val_residual;
-}
-
-template<class ScalarType>
-void CSysVector<ScalarType>::SetBlock_Zero(unsigned long val_ipoint) {
-  unsigned short iVar;
+CSysVector<ScalarType> & CSysVector<ScalarType>::operator/=(ScalarType val) {
 
-  for (iVar = 0; iVar < nVar; iVar++)
-    vec_val[val_ipoint*nVar+iVar] = 0.0;
-}
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElm; i++) vec_val[i] /= val;
 
-template<class ScalarType>
-void CSysVector<ScalarType>::SetBlock_Zero(unsigned long val_ipoint, unsigned short val_var) {
-    vec_val[val_ipoint*nVar+val_var] = 0.0;
+  return *this;
 }
 
 template<class ScalarType>
-ScalarType CSysVector<ScalarType>::GetBlock(unsigned long val_ipoint, unsigned short val_var) {
-  return vec_val[val_ipoint*nVar + val_var];
-}
+void CSysVector<ScalarType>::CopyToArray(ScalarType* u_array) const {
 
-template<class ScalarType>
-ScalarType *CSysVector<ScalarType>::GetBlock(unsigned long val_ipoint) {
-  return &vec_val[val_ipoint*nVar];
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElm; i++) u_array[i] = vec_val[i];
 }
 
 template<class ScalarType>
-template<class T>
-void CSysVector<ScalarType>::PassiveCopy(const CSysVector<T>& other) {
+ScalarType CSysVector<ScalarType>::dot(const CSysVector<ScalarType> & u) const {
+#if !defined(CODI_FORWARD_TYPE) && !defined(CODI_REVERSE_TYPE)
 
-  /*--- This is a method and not the overload of an operator to make sure who
-   calls it knows the consequence to the derivative information (lost) ---*/
+  /*--- All threads get the same "view" of the vectors and shared variable. ---*/
+  SU2_OMP_BARRIER
+  dotRes = 0.0;
+  SU2_OMP_BARRIER
 
-  /*--- check if self-assignment, otherwise perform deep copy ---*/
-  if ((const void*)this == (const void*)&other) return;
+  /*--- Reduction over all threads in this mpi rank using the shared variable. ---*/
+  ScalarType sum = 0.0;
 
-  /*--- determine if (re-)allocation is needed ---*/
-  if (nElm != other.GetLocSize() && vec_val != NULL) {
-    delete [] vec_val;
-    vec_val = NULL;
-  }
+  PARALLEL_FOR
+  for(auto i=0ul; i<nElmDomain; ++i)
+    sum += vec_val[i]*u.vec_val[i];
 
-  /*--- copy ---*/
-  nElm = other.GetLocSize();
-  nElmDomain = other.GetNElmDomain();
-  nBlk = other.GetNBlk();
-  nBlkDomain = other.GetNBlkDomain();
-  nVar = other.GetNVar();
+  SU2_OMP(atomic)
+  dotRes += sum;
 
-  if (vec_val == NULL)
-    vec_val = new ScalarType[nElm];
-
-  for (unsigned long i = 0; i < nElm; i++)
-    vec_val[i] = SU2_TYPE::GetValue(other[i]);
+  /*--- Wait for all atomic updates. ---*/
+  SU2_OMP_BARRIER
 
 #ifdef HAVE_MPI
-  nElmGlobal = other.GetSize();
-#endif
-}
-
-template<class ScalarType>
-ScalarType dotProd(const CSysVector<ScalarType> & u, const CSysVector<ScalarType> & v) {
-
-  /*--- check for consistent sizes ---*/
-  if (u.nElm != v.nElm) {
-    SU2_MPI::Error("Sizes do not match", CURRENT_FUNCTION);
+  /*--- Reduce across all MPI ranks; only the master thread communicates. ---*/
+  SU2_OMP_MASTER
+  {
+    sum = dotRes;
+    SelectMPIWrapper<ScalarType>::W::Allreduce(&sum, &dotRes, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   }
-
-  /*--- find local inner product and, if a parallel run, sum over all
-   processors (we use nElemDomain instead of nElem) ---*/
-  ScalarType loc_prod = 0.0;
-  for (unsigned long i = 0; i < u.nElmDomain; i++)
-    loc_prod += u.vec_val[i]*v.vec_val[i];
-  ScalarType prod = 0.0;
-
+  /*--- Make view of result consistent across threads. ---*/
+  SU2_OMP_BARRIER
+#endif // MPI
+#else // CODI_TYPE
+  /*--- Version for CoDi types: no OMP reductions or atomics, the master thread does everything. ---*/
+  SU2_OMP_BARRIER
+  SU2_OMP_MASTER
+  {
+    ScalarType sum = 0.0;
+    for(auto i=0ul; i<nElmDomain; ++i)
+      sum += vec_val[i]*u.vec_val[i];
 #ifdef HAVE_MPI
-  SelectMPIWrapper<ScalarType>::W::Allreduce(&loc_prod, &prod, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    /*--- Reduce across all mpi ranks. ---*/
+    SelectMPIWrapper<ScalarType>::W::Allreduce(&sum, &dotRes, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #else
-  prod = loc_prod;
-#endif
-
-  return prod;
+    dotRes = sum;
+#endif // MPI
+  }
+  SU2_OMP_BARRIER
+#endif // CODI
+  return dotRes;
 }
 
 /*--- Explicit instantiations ---*/
 template class CSysVector<su2double>;
-template CSysVector<su2double> operator*(const su2double&, const CSysVector<su2double>&);
 template void CSysVector<su2double>::PassiveCopy(const CSysVector<su2double>&);
-template su2double dotProd<su2double>(const CSysVector<su2double> & u, const CSysVector<su2double> & v);
-
-template class CSysVector<unsigned long>;
 
 #ifdef CODI_REVERSE_TYPE
 template class CSysVector<passivedouble>;
-template CSysVector<passivedouble> operator*(const passivedouble&, const CSysVector<passivedouble>&);
 template void CSysVector<su2double>::PassiveCopy(const CSysVector<passivedouble>&);
 template void CSysVector<passivedouble>::PassiveCopy(const CSysVector<su2double>&);
-template passivedouble dotProd<passivedouble>(const CSysVector<passivedouble> & u, const CSysVector<passivedouble> & v);
 #endif
diff --git a/SU2_CFD/include/SU2_CFD.hpp b/SU2_CFD/include/SU2_CFD.hpp
index 142649fb187b..57ce99b0d8db 100644
--- a/SU2_CFD/include/SU2_CFD.hpp
+++ b/SU2_CFD/include/SU2_CFD.hpp
@@ -7,7 +7,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2019, SU2 Contributors (cf. AUTHORS.md)
@@ -28,6 +28,7 @@
 #pragma once
 
 #include "../../Common/include/mpi_structure.hpp"
+#include "../../Common/include/omp_structure.hpp"
 #include "CLI11.hpp"
 
 #include <ctime>
diff --git a/SU2_CFD/include/solver_structure.hpp b/SU2_CFD/include/solver_structure.hpp
index f57c8e048498..233628fd6fa4 100644
--- a/SU2_CFD/include/solver_structure.hpp
+++ b/SU2_CFD/include/solver_structure.hpp
@@ -3734,15 +3734,6 @@ class CSolver {
    */
   virtual su2double Get_val_I(void);
   
-  /*!
-   * \brief A virtual member.
-   * \param[in] iPoint - Point i of the Mass Matrix.
-   * \param[in] jPoint - Point j of the Mass Matrix.
-   * \param[in] iVar - Variable i of the Mass Matrix submatrix.
-   * \param[in] iVar - Variable j of the Mass Matrix submatrix.
-   */
-  virtual su2double Get_MassMatrix(unsigned long iPoint, unsigned long jPoint, unsigned short iVar, unsigned short jVar);
-  
   /*!
    * \brief Gauss method for solving a linear system.
    * \param[in] A - Matrix Ax = b.
@@ -12222,15 +12213,6 @@ class CFEASolver : public CSolver {
    */
   unsigned short Get_iElem_iDe(unsigned long iElem);
   
-  /*!
-   * \brief Retrieve the Mass Matrix term (to add to the Jacobian of the adjoint problem)
-   * \param[in] iPoint - Point i of the Mass Matrix.
-   * \param[in] jPoint - Point j of the Mass Matrix.
-   * \param[in] iVar - Variable i of the Mass Matrix submatrix.
-   * \param[in] iVar - Variable j of the Mass Matrix submatrix.
-   */
-  su2double Get_MassMatrix(unsigned long iPoint, unsigned long jPoint, unsigned short iVar, unsigned short jVar);
-  
   /*!
    * \brief Load a solution from a restart file.
    * \param[in] geometry - Geometrical definition of the problem.
diff --git a/SU2_CFD/include/solver_structure.inl b/SU2_CFD/include/solver_structure.inl
index 9a8fc6ddd939..763826b86832 100644
--- a/SU2_CFD/include/solver_structure.inl
+++ b/SU2_CFD/include/solver_structure.inl
@@ -1961,8 +1961,6 @@ inline su2double CSolver::Get_DV_Val(unsigned short i_DV){ return 0.0; }
 
 inline su2double CSolver::Get_val_I(void){ return 0.0; }
 
-inline su2double CSolver::Get_MassMatrix(unsigned long iPoint, unsigned long jPoint, unsigned short iVar, unsigned short jVar){ return 0.0; }
-
 inline su2double CIncEulerSolver::GetDensity_Inf(void) { return Density_Inf; }
 
 inline su2double CIncEulerSolver::GetModVelocity_Inf(void) {
@@ -2282,9 +2280,6 @@ inline su2double CHeatSolverFVM::GetConjugateHeatVariable(unsigned short val_mar
 inline void CHeatSolverFVM::SetConjugateHeatVariable(unsigned short val_marker, unsigned long val_vertex, unsigned short pos_var, su2double relaxation_factor, su2double val_var) {
   ConjugateVar[val_marker][val_vertex][pos_var] = relaxation_factor*val_var + (1.0-relaxation_factor)*ConjugateVar[val_marker][val_vertex][pos_var]; }
 
-inline su2double CFEASolver::Get_MassMatrix(unsigned long iPoint, unsigned long jPoint, unsigned short iVar, unsigned short jVar){ 
-  return MassMatrix.GetBlock(iPoint, jPoint, iVar, jVar); }
-
 inline unsigned short CFEASolver::Get_iElem_iDe(unsigned long iElem){ return iElem_iDe[iElem]; }
 
 inline su2double CFEASolver::GetRes_FEM(unsigned short val_var) { return Conv_Check[val_var]; }
diff --git a/SU2_CFD/src/SU2_CFD.cpp b/SU2_CFD/src/SU2_CFD.cpp
index cfee50ffb5cb..dcefc788e553 100644
--- a/SU2_CFD/src/SU2_CFD.cpp
+++ b/SU2_CFD/src/SU2_CFD.cpp
@@ -60,7 +60,12 @@ int main(int argc, char *argv[]) {
 #ifdef HAVE_MPI
   int  buffsize;
   char *buffptr;
+#ifdef HAVE_OMP
+  int provided;
+  SU2_MPI::Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
+#else
   SU2_MPI::Init(&argc, &argv);
+#endif
   SU2_MPI::Buffer_attach( malloc(BUFSIZE), BUFSIZE );
   SU2_Comm MPICommunicator(MPI_COMM_WORLD);
 #else
diff --git a/SU2_CFD/src/output/CElasticityOutput.cpp b/SU2_CFD/src/output/CElasticityOutput.cpp
index 98080abbc820..d8c5db7b9956 100644
--- a/SU2_CFD/src/output/CElasticityOutput.cpp
+++ b/SU2_CFD/src/output/CElasticityOutput.cpp
@@ -117,7 +117,7 @@ void CElasticityOutput::LoadHistoryData(CConfig *config, CGeometry *geometry, CS
   } else if (nonlinear_analysis){
     SetHistoryOutputValue("RMS_UTOL", log10(fea_solver->LinSysSol.norm()));
     SetHistoryOutputValue("RMS_RTOL", log10(fea_solver->LinSysRes.norm()));
-    SetHistoryOutputValue("RMS_ETOL", log10(dotProd(fea_solver->LinSysSol, fea_solver->LinSysRes)));
+    SetHistoryOutputValue("RMS_ETOL", log10(fea_solver->LinSysSol.dot(fea_solver->LinSysRes)));
 
   }
 
diff --git a/SU2_CFD/src/solver_direct_elasticity.cpp b/SU2_CFD/src/solver_direct_elasticity.cpp
index 9cff906dcf9b..3274549b534b 100644
--- a/SU2_CFD/src/solver_direct_elasticity.cpp
+++ b/SU2_CFD/src/solver_direct_elasticity.cpp
@@ -2139,7 +2139,7 @@ void CFEASolver::Postprocessing(CGeometry *geometry, CSolver **solver_container,
 
       Conv_Check[0] = LinSysSol.norm();               // Norm of the delta-solution vector
       Conv_Check[1] = LinSysRes.norm();               // Norm of the residual
-      Conv_Check[2] = dotProd(LinSysSol, LinSysRes);  // Position for the energy tolerance
+      Conv_Check[2] = LinSysSol.dot(LinSysRes);       // Position for the energy tolerance
 
       /*--- MPI solution ---*/
 
@@ -3024,8 +3024,8 @@ void CFEASolver::ImplicitEuler_Iteration(CGeometry *geometry, CSolver **solver_c
 
 void CFEASolver::ImplicitNewmark_Iteration(CGeometry *geometry, CSolver **solver_container, CConfig *config) {
 
-  unsigned long iPoint, jPoint;
-  unsigned short iVar, jVar;
+  unsigned long iPoint;
+  unsigned short iVar;
 
   bool first_iter = (config->GetInnerIter() == 0);
   bool dynamic = (config->GetTime_Domain());              // Dynamic simulations.
@@ -3099,16 +3099,7 @@ void CFEASolver::ImplicitNewmark_Iteration(CGeometry *geometry, CSolver **solver
      *
      */
     if ((nonlinear_analysis && (newton_raphson || first_iter)) || linear_analysis) {
-      for (iPoint = 0; iPoint < nPoint; iPoint++) {
-        for (jPoint = 0; jPoint < nPoint; jPoint++) {
-          for(iVar = 0; iVar < nVar; iVar++) {
-            for (jVar = 0; jVar < nVar; jVar++) {
-              Jacobian_ij[iVar][jVar] = a_dt[0] * MassMatrix.GetBlock(iPoint, jPoint, iVar, jVar);
-            }
-          }
-          Jacobian.AddBlock(iPoint, jPoint, Jacobian_ij);
-        }
-      }
+      Jacobian.MatrixMatrixAddition(a_dt[0], MassMatrix);
     }
 
 
@@ -3330,8 +3321,8 @@ void CFEASolver::ImplicitNewmark_Relaxation(CGeometry *geometry, CSolver **solve
 
 void CFEASolver::GeneralizedAlpha_Iteration(CGeometry *geometry, CSolver **solver_container, CConfig *config) {
 
-  unsigned long iPoint, jPoint;
-  unsigned short iVar, jVar;
+  unsigned long iPoint;
+  unsigned short iVar;
 
   bool first_iter = (config->GetInnerIter() == 0);
   bool dynamic = (config->GetTime_Domain());              // Dynamic simulations.
@@ -3398,16 +3389,7 @@ void CFEASolver::GeneralizedAlpha_Iteration(CGeometry *geometry, CSolver **solve
      *
      */
     if ((nonlinear_analysis && (newton_raphson || first_iter)) || linear_analysis) {
-      for (iPoint = 0; iPoint < nPoint; iPoint++) {
-        for (jPoint = 0; jPoint < nPoint; jPoint++) {
-          for(iVar = 0; iVar < nVar; iVar++) {
-            for (jVar = 0; jVar < nVar; jVar++) {
-              Jacobian_ij[iVar][jVar] = a_dt[0] * MassMatrix.GetBlock(iPoint, jPoint, iVar, jVar);
-            }
-          }
-          Jacobian.AddBlock(iPoint, jPoint, Jacobian_ij);
-        }
-      }
+      Jacobian.MatrixMatrixAddition(a_dt[0], MassMatrix);
     }
 
 
diff --git a/SU2_CFD/src/solver_direct_heat.cpp b/SU2_CFD/src/solver_direct_heat.cpp
index 550dec6e477d..c11890658dba 100644
--- a/SU2_CFD/src/solver_direct_heat.cpp
+++ b/SU2_CFD/src/solver_direct_heat.cpp
@@ -520,38 +520,35 @@ void CHeatSolverFVM::Centered_Residual(CGeometry *geometry, CSolver **solver_con
 
     nVarFlow = solver_container[FLOW_SOL]->GetnVar();
 
-      for (iEdge = 0; iEdge < geometry->GetnEdge(); iEdge++) {
+    for (iEdge = 0; iEdge < geometry->GetnEdge(); iEdge++) {
 
-        /*--- Points in edge ---*/
-        iPoint = geometry->edge[iEdge]->GetNode(0);
-        jPoint = geometry->edge[iEdge]->GetNode(1);
-        numerics->SetNormal(geometry->edge[iEdge]->GetNormal());
+      /*--- Points in edge ---*/
+      iPoint = geometry->edge[iEdge]->GetNode(0);
+      jPoint = geometry->edge[iEdge]->GetNode(1);
+      numerics->SetNormal(geometry->edge[iEdge]->GetNormal());
 
-        /*--- Primitive variables w/o reconstruction ---*/
-        V_i = solver_container[FLOW_SOL]->GetNodes()->GetPrimitive(iPoint);
-        V_j = solver_container[FLOW_SOL]->GetNodes()->GetPrimitive(jPoint);
+      /*--- Primitive variables w/o reconstruction ---*/
+      V_i = solver_container[FLOW_SOL]->GetNodes()->GetPrimitive(iPoint);
+      V_j = solver_container[FLOW_SOL]->GetNodes()->GetPrimitive(jPoint);
 
-        Temp_i = nodes->GetSolution(iPoint,0);
-        Temp_j = nodes->GetSolution(jPoint,0);
+      Temp_i = nodes->GetSolution(iPoint,0);
+      Temp_j = nodes->GetSolution(jPoint,0);
 
-        numerics->SetUndivided_Laplacian(nodes->GetUndivided_Laplacian(iPoint), nodes->GetUndivided_Laplacian(jPoint));
-        numerics->SetNeighbor(geometry->node[iPoint]->GetnNeighbor(), geometry->node[jPoint]->GetnNeighbor());
+      numerics->SetUndivided_Laplacian(nodes->GetUndivided_Laplacian(iPoint), nodes->GetUndivided_Laplacian(jPoint));
+      numerics->SetNeighbor(geometry->node[iPoint]->GetnNeighbor(), geometry->node[jPoint]->GetnNeighbor());
 
-        numerics->SetPrimitive(V_i, V_j);
-        numerics->SetTemperature(Temp_i, Temp_j);
+      numerics->SetPrimitive(V_i, V_j);
+      numerics->SetTemperature(Temp_i, Temp_j);
 
-        numerics->ComputeResidual(Residual, Jacobian_i, Jacobian_j, config);
+      numerics->ComputeResidual(Residual, Jacobian_i, Jacobian_j, config);
 
-        LinSysRes.AddBlock(iPoint, Residual);
-        LinSysRes.SubtractBlock(jPoint, Residual);
+      LinSysRes.AddBlock(iPoint, Residual);
+      LinSysRes.SubtractBlock(jPoint, Residual);
 
-        /*--- Implicit part ---*/
+      /*--- Implicit part ---*/
 
-        Jacobian.AddBlock(iPoint, iPoint, Jacobian_i);
-        Jacobian.AddBlock(iPoint, jPoint, Jacobian_j);
-        Jacobian.SubtractBlock(jPoint, iPoint, Jacobian_i);
-        Jacobian.SubtractBlock(jPoint, jPoint, Jacobian_j);
-      }
+      Jacobian.UpdateBlocks(iEdge, iPoint, jPoint, Jacobian_i, Jacobian_j);
+    }
   }
 }
 
@@ -571,83 +568,80 @@ void CHeatSolverFVM::Upwind_Residual(CGeometry *geometry, CSolver **solver_conta
 
     nVarFlow = solver_container[FLOW_SOL]->GetnVar();
 
-      for (iEdge = 0; iEdge < geometry->GetnEdge(); iEdge++) {
+    for (iEdge = 0; iEdge < geometry->GetnEdge(); iEdge++) {
 
-        /*--- Points in edge ---*/
-        iPoint = geometry->edge[iEdge]->GetNode(0);
-        jPoint = geometry->edge[iEdge]->GetNode(1);
-        numerics->SetNormal(geometry->edge[iEdge]->GetNormal());
+      /*--- Points in edge ---*/
+      iPoint = geometry->edge[iEdge]->GetNode(0);
+      jPoint = geometry->edge[iEdge]->GetNode(1);
+      numerics->SetNormal(geometry->edge[iEdge]->GetNormal());
 
-        /*--- Primitive variables w/o reconstruction ---*/
-        V_i = solver_container[FLOW_SOL]->GetNodes()->GetPrimitive(iPoint);
-        V_j = solver_container[FLOW_SOL]->GetNodes()->GetPrimitive(jPoint);
+      /*--- Primitive variables w/o reconstruction ---*/
+      V_i = solver_container[FLOW_SOL]->GetNodes()->GetPrimitive(iPoint);
+      V_j = solver_container[FLOW_SOL]->GetNodes()->GetPrimitive(jPoint);
 
-        Temp_i_Grad = nodes->GetGradient(iPoint);
-        Temp_j_Grad = nodes->GetGradient(jPoint);
-        numerics->SetConsVarGradient(Temp_i_Grad, Temp_j_Grad);
+      Temp_i_Grad = nodes->GetGradient(iPoint);
+      Temp_j_Grad = nodes->GetGradient(jPoint);
+      numerics->SetConsVarGradient(Temp_i_Grad, Temp_j_Grad);
 
-        Temp_i = nodes->GetSolution(iPoint,0);
-        Temp_j = nodes->GetSolution(jPoint,0);
+      Temp_i = nodes->GetSolution(iPoint,0);
+      Temp_j = nodes->GetSolution(jPoint,0);
 
-        /* Second order reconstruction */
-        if (muscl) {
+      /* Second order reconstruction */
+      if (muscl) {
 
-            for (iDim = 0; iDim < nDim; iDim++) {
-              Vector_i[iDim] = 0.5*(geometry->node[jPoint]->GetCoord(iDim) - geometry->node[iPoint]->GetCoord(iDim));
-              Vector_j[iDim] = 0.5*(geometry->node[iPoint]->GetCoord(iDim) - geometry->node[jPoint]->GetCoord(iDim));
-            }
+        for (iDim = 0; iDim < nDim; iDim++) {
+          Vector_i[iDim] = 0.5*(geometry->node[jPoint]->GetCoord(iDim) - geometry->node[iPoint]->GetCoord(iDim));
+          Vector_j[iDim] = 0.5*(geometry->node[iPoint]->GetCoord(iDim) - geometry->node[jPoint]->GetCoord(iDim));
+        }
 
-            Gradient_i = solver_container[FLOW_SOL]->GetNodes()->GetGradient_Reconstruction(iPoint);
-            Gradient_j = solver_container[FLOW_SOL]->GetNodes()->GetGradient_Reconstruction(jPoint);
-            Temp_i_Grad = nodes->GetGradient_Reconstruction(iPoint);
-            Temp_j_Grad = nodes->GetGradient_Reconstruction(jPoint);
+        Gradient_i = solver_container[FLOW_SOL]->GetNodes()->GetGradient_Reconstruction(iPoint);
+        Gradient_j = solver_container[FLOW_SOL]->GetNodes()->GetGradient_Reconstruction(jPoint);
+        Temp_i_Grad = nodes->GetGradient_Reconstruction(iPoint);
+        Temp_j_Grad = nodes->GetGradient_Reconstruction(jPoint);
 
-            /*Loop to correct the flow variables*/
-            for (iVar = 0; iVar < nVarFlow; iVar++) {
+        /*Loop to correct the flow variables*/
+        for (iVar = 0; iVar < nVarFlow; iVar++) {
 
-              /*Apply the Gradient to get the right temperature value on the edge */
-              Project_Grad_i = 0.0; Project_Grad_j = 0.0;
-              for (iDim = 0; iDim < nDim; iDim++) {
-                  Project_Grad_i += Vector_i[iDim]*Gradient_i[iVar][iDim];
-                  Project_Grad_j += Vector_j[iDim]*Gradient_j[iVar][iDim];
-              }
+          /*Apply the gradient to get the reconstructed flow variable on the edge */
+          Project_Grad_i = 0.0; Project_Grad_j = 0.0;
+          for (iDim = 0; iDim < nDim; iDim++) {
+              Project_Grad_i += Vector_i[iDim]*Gradient_i[iVar][iDim];
+              Project_Grad_j += Vector_j[iDim]*Gradient_j[iVar][iDim];
+          }
 
-              Primitive_Flow_i[iVar] = V_i[iVar] + Project_Grad_i;
-              Primitive_Flow_j[iVar] = V_j[iVar] + Project_Grad_j;
-            }
+          Primitive_Flow_i[iVar] = V_i[iVar] + Project_Grad_i;
+          Primitive_Flow_j[iVar] = V_j[iVar] + Project_Grad_j;
+        }
 
-            /* Correct the temperature variables */
-            Project_Temp_i_Grad = 0.0; Project_Temp_j_Grad = 0.0;
-            for (iDim = 0; iDim < nDim; iDim++) {
-                Project_Temp_i_Grad += Vector_i[iDim]*Temp_i_Grad[0][iDim];
-                Project_Temp_j_Grad += Vector_j[iDim]*Temp_j_Grad[0][iDim];
-            }
+        /* Correct the temperature variables */
+        Project_Temp_i_Grad = 0.0; Project_Temp_j_Grad = 0.0;
+        for (iDim = 0; iDim < nDim; iDim++) {
+            Project_Temp_i_Grad += Vector_i[iDim]*Temp_i_Grad[0][iDim];
+            Project_Temp_j_Grad += Vector_j[iDim]*Temp_j_Grad[0][iDim];
+        }
 
-            Temp_i_Corrected = Temp_i + Project_Temp_i_Grad;
-            Temp_j_Corrected = Temp_j + Project_Temp_j_Grad;
+        Temp_i_Corrected = Temp_i + Project_Temp_i_Grad;
+        Temp_j_Corrected = Temp_j + Project_Temp_j_Grad;
 
-            numerics->SetPrimitive(Primitive_Flow_i, Primitive_Flow_j);
-            numerics->SetTemperature(Temp_i_Corrected, Temp_j_Corrected);
-        }
+        numerics->SetPrimitive(Primitive_Flow_i, Primitive_Flow_j);
+        numerics->SetTemperature(Temp_i_Corrected, Temp_j_Corrected);
+      }
 
-        else {
+      else {
 
-          numerics->SetPrimitive(V_i, V_j);
-          numerics->SetTemperature(Temp_i, Temp_j);
-        }
+        numerics->SetPrimitive(V_i, V_j);
+        numerics->SetTemperature(Temp_i, Temp_j);
+      }
 
-        numerics->ComputeResidual(Residual, Jacobian_i, Jacobian_j, config);
+      numerics->ComputeResidual(Residual, Jacobian_i, Jacobian_j, config);
 
-        LinSysRes.AddBlock(iPoint, Residual);
-        LinSysRes.SubtractBlock(jPoint, Residual);
+      LinSysRes.AddBlock(iPoint, Residual);
+      LinSysRes.SubtractBlock(jPoint, Residual);
 
-        /*--- Implicit part ---*/
+      /*--- Implicit part ---*/
 
-        Jacobian.AddBlock(iPoint, iPoint, Jacobian_i);
-        Jacobian.AddBlock(iPoint, jPoint, Jacobian_j);
-        Jacobian.SubtractBlock(jPoint, iPoint, Jacobian_i);
-        Jacobian.SubtractBlock(jPoint, jPoint, Jacobian_j);
-        }
+      Jacobian.UpdateBlocks(iEdge, iPoint, jPoint, Jacobian_i, Jacobian_j);
+    }
   }
 
 }
diff --git a/SU2_CFD/src/solver_direct_mean.cpp b/SU2_CFD/src/solver_direct_mean.cpp
index 0a236239dfe1..9ca16fd27b36 100644
--- a/SU2_CFD/src/solver_direct_mean.cpp
+++ b/SU2_CFD/src/solver_direct_mean.cpp
@@ -3333,13 +3333,10 @@ void CEulerSolver::Centered_Residual(CGeometry *geometry, CSolver **solver_conta
     
     /*--- Set implicit computation ---*/
     if (implicit) {
-      Jacobian.AddBlock(iPoint, iPoint, Jacobian_i);
-      Jacobian.AddBlock(iPoint, jPoint, Jacobian_j);
-      Jacobian.SubtractBlock(jPoint, iPoint, Jacobian_i);
-      Jacobian.SubtractBlock(jPoint, jPoint, Jacobian_j);
+      Jacobian.UpdateBlocks(iEdge, iPoint, jPoint, Jacobian_i, Jacobian_j);
     }
   }
-  
+
 }
 
 void CEulerSolver::Upwind_Residual(CGeometry *geometry, CSolver **solver_container, CNumerics *numerics,
@@ -3579,10 +3576,7 @@ void CEulerSolver::Upwind_Residual(CGeometry *geometry, CSolver **solver_contain
     /*--- Set implicit Jacobians ---*/
     
     if (implicit) {
-      Jacobian.AddBlock(iPoint, iPoint, Jacobian_i);
-      Jacobian.AddBlock(iPoint, jPoint, Jacobian_j);
-      Jacobian.SubtractBlock(jPoint, iPoint, Jacobian_i);
-      Jacobian.SubtractBlock(jPoint, jPoint, Jacobian_j);
+      Jacobian.UpdateBlocks(iEdge, iPoint, jPoint, Jacobian_i, Jacobian_j);
     }
 
     /*--- Set the final value of the Roe dissipation coefficient ---*/
@@ -8948,8 +8942,7 @@ void CEulerSolver::BC_Riemann(CGeometry *geometry, CSolver **solver_container,
 void CEulerSolver::BC_TurboRiemann(CGeometry *geometry, CSolver **solver_container,
     CNumerics *conv_numerics, CNumerics *visc_numerics, CConfig *config, unsigned short val_marker) {
   unsigned short iDim, iVar, jVar, kVar, iSpan;
-  unsigned long iPoint, Point_Normal, oldVertex;
-  long iVertex;
+  unsigned long iPoint, Point_Normal, oldVertex, iVertex;
   su2double P_Total, T_Total, *Flow_Dir;
   su2double *Velocity_b, Velocity2_b, Enthalpy_b, Energy_b, StaticEnergy_b, Density_b, Kappa_b, Chi_b, Pressure_b, Temperature_b;
   su2double *Velocity_e, Velocity2_e, Enthalpy_e, Entropy_e, Energy_e = 0.0, StaticEnthalpy_e, StaticEnergy_e, Density_e = 0.0, Pressure_e;
@@ -8995,7 +8988,7 @@ void CEulerSolver::BC_TurboRiemann(CGeometry *geometry, CSolver **solver_contain
 
   /*--- Loop over all the vertices on this boundary marker ---*/
   for (iSpan= 0; iSpan < nSpanWiseSections; iSpan++){
-    for (iVertex = 0; iVertex < geometry->nVertexSpan[val_marker][iSpan]; iVertex++) {
+    for (iVertex = 0; iVertex < geometry->GetnVertexSpan(val_marker,iSpan); iVertex++) {
 
       /*--- using the other vertex information for retrieving some information ---*/
       oldVertex = geometry->turbovertex[val_marker][iSpan][iVertex]->GetOldVertex();
@@ -9454,8 +9447,8 @@ void CEulerSolver::PreprocessBC_Giles(CGeometry *geometry, CConfig *config, CNum
   su2double cj_inf,cj_out1, cj_out2, Density_i, Pressure_i, *turboNormal, *turboVelocity, *Velocity_i, AverageSoundSpeed;
   su2double *deltaprim, *cj, TwoPiThetaFreq_Pitch, pitch, theta, deltaTheta;
   unsigned short iMarker, iSpan, iMarkerTP, iDim;
-  unsigned long  iPoint, kend_max, k;
-  long iVertex, freq;
+  unsigned long  iPoint, kend_max, k, iVertex;
+  long freq;
   unsigned short  iZone     = config->GetiZone();
   unsigned short nSpanWiseSections = geometry->GetnSpanWiseSections(marker_flag);
   turboNormal 	= new su2double[nDim];
@@ -9481,7 +9474,7 @@ void CEulerSolver::PreprocessBC_Giles(CGeometry *geometry, CConfig *config, CNum
         for (iMarkerTP=1; iMarkerTP < config->GetnMarker_Turbomachinery()+1; iMarkerTP++){
           if (config->GetMarker_All_Turbomachinery(iMarker) == iMarkerTP){
             if (config->GetMarker_All_TurbomachineryFlag(iMarker) == marker_flag){
-              for (iVertex = 0; iVertex < geometry->nVertexSpan[iMarker][iSpan]; iVertex++) {
+              for (iVertex = 0; iVertex < geometry->GetnVertexSpan(iMarker,iSpan); iVertex++) {
 
                 /*--- find the node related to the vertex ---*/
                 iPoint = geometry->turbovertex[iMarker][iSpan][iVertex]->GetNode();
@@ -9607,8 +9600,7 @@ void CEulerSolver::PreprocessBC_Giles(CGeometry *geometry, CConfig *config, CNum
 void CEulerSolver::BC_Giles(CGeometry *geometry, CSolver **solver_container,
     CNumerics *conv_numerics, CNumerics *visc_numerics, CConfig *config, unsigned short val_marker) {
   unsigned short iDim, iVar, jVar, iSpan;
-  unsigned long  iPoint, Point_Normal, oldVertex, k, kend, kend_max;
-  long iVertex;
+  unsigned long  iPoint, Point_Normal, oldVertex, k, kend, kend_max, iVertex;
   su2double  *UnitNormal, *turboVelocity, *turboNormal;
 
   su2double *Velocity_b, Velocity2_b, Enthalpy_b, Energy_b, Density_b, Pressure_b, Temperature_b;
@@ -9915,7 +9907,7 @@ void CEulerSolver::BC_Giles(CGeometry *geometry, CSolver **solver_container,
 
     /*--- Loop over all the vertices on this boundary marker ---*/
 
-    for (iVertex = 0; iVertex < geometry->nVertexSpan[val_marker][iSpan]; iVertex++) {
+    for (iVertex = 0; iVertex < geometry->GetnVertexSpan(val_marker,iSpan); iVertex++) {
 
       /*--- using the other vertex information for retrieving some information ---*/
       oldVertex = geometry->turbovertex[val_marker][iSpan][iVertex]->GetOldVertex();
@@ -15359,10 +15351,7 @@ void CNSSolver::Viscous_Residual(CGeometry *geometry, CSolver **solver_container
     /*--- Implicit part ---*/
     
     if (implicit) {
-      Jacobian.SubtractBlock(iPoint, iPoint, Jacobian_i);
-      Jacobian.SubtractBlock(iPoint, jPoint, Jacobian_j);
-      Jacobian.AddBlock(jPoint, iPoint, Jacobian_i);
-      Jacobian.AddBlock(jPoint, jPoint, Jacobian_j);
+      Jacobian.UpdateBlocks<su2double,-1>(iEdge, iPoint, jPoint, Jacobian_i, Jacobian_j);
     }
     
   }
diff --git a/SU2_CFD/src/solver_direct_mean_inc.cpp b/SU2_CFD/src/solver_direct_mean_inc.cpp
index 42f73a564b5e..12ff1babeee6 100644
--- a/SU2_CFD/src/solver_direct_mean_inc.cpp
+++ b/SU2_CFD/src/solver_direct_mean_inc.cpp
@@ -1859,10 +1859,7 @@ void CIncEulerSolver::Centered_Residual(CGeometry *geometry, CSolver **solver_co
     /*--- Store implicit contributions from the residual calculation. ---*/
     
     if (implicit) {
-      Jacobian.AddBlock(iPoint, iPoint, Jacobian_i);
-      Jacobian.AddBlock(iPoint, jPoint, Jacobian_j);
-      Jacobian.SubtractBlock(jPoint, iPoint, Jacobian_i);
-      Jacobian.SubtractBlock(jPoint, jPoint, Jacobian_j);
+      Jacobian.UpdateBlocks(iEdge, iPoint, jPoint, Jacobian_i, Jacobian_j);
     }
   }
   
@@ -2008,10 +2005,7 @@ void CIncEulerSolver::Upwind_Residual(CGeometry *geometry, CSolver **solver_cont
     /*--- Set implicit Jacobians ---*/
     
     if (implicit) {
-      Jacobian.AddBlock(iPoint, iPoint, Jacobian_i);
-      Jacobian.AddBlock(iPoint, jPoint, Jacobian_j);
-      Jacobian.SubtractBlock(jPoint, iPoint, Jacobian_i);
-      Jacobian.SubtractBlock(jPoint, jPoint, Jacobian_j);
+      Jacobian.UpdateBlocks(iEdge, iPoint, jPoint, Jacobian_i, Jacobian_j);
     }
   }
   
@@ -7718,10 +7712,7 @@ void CIncNSSolver::Viscous_Residual(CGeometry *geometry, CSolver **solver_contai
     /*--- Implicit part ---*/
     
     if (implicit) {
-      Jacobian.SubtractBlock(iPoint, iPoint, Jacobian_i);
-      Jacobian.SubtractBlock(iPoint, jPoint, Jacobian_j);
-      Jacobian.AddBlock(jPoint, iPoint, Jacobian_i);
-      Jacobian.AddBlock(jPoint, jPoint, Jacobian_j);
+      Jacobian.UpdateBlocks<su2double,-1>(iEdge, iPoint, jPoint, Jacobian_i, Jacobian_j);
     }
     
   }
diff --git a/SU2_CFD/src/solver_direct_turbulent.cpp b/SU2_CFD/src/solver_direct_turbulent.cpp
index 2f4d33d7c634..92e797b764a8 100644
--- a/SU2_CFD/src/solver_direct_turbulent.cpp
+++ b/SU2_CFD/src/solver_direct_turbulent.cpp
@@ -193,10 +193,7 @@ void CTurbSolver::Upwind_Residual(CGeometry *geometry, CSolver **solver_containe
     
     /*--- Implicit part ---*/
     
-    Jacobian.AddBlock(iPoint, iPoint, Jacobian_i);
-    Jacobian.AddBlock(iPoint, jPoint, Jacobian_j);
-    Jacobian.SubtractBlock(jPoint, iPoint, Jacobian_i);
-    Jacobian.SubtractBlock(jPoint, jPoint, Jacobian_j);
+    Jacobian.UpdateBlocks(iEdge, iPoint, jPoint, Jacobian_i, Jacobian_j);
     
   }
   
@@ -242,10 +239,7 @@ void CTurbSolver::Viscous_Residual(CGeometry *geometry, CSolver **solver_contain
     LinSysRes.SubtractBlock(iPoint, Residual);
     LinSysRes.AddBlock(jPoint, Residual);
     
-    Jacobian.SubtractBlock(iPoint, iPoint, Jacobian_i);
-    Jacobian.SubtractBlock(iPoint, jPoint, Jacobian_j);
-    Jacobian.AddBlock(jPoint, iPoint, Jacobian_i);
-    Jacobian.AddBlock(jPoint, jPoint, Jacobian_j);
+    Jacobian.UpdateBlocks<su2double,-1>(iEdge, iPoint, jPoint, Jacobian_i, Jacobian_j);
     
   }
   
@@ -2057,8 +2051,7 @@ void CTurbSASolver::BC_ActDisk(CGeometry *geometry, CSolver **solver_container,
 void CTurbSASolver::BC_Inlet_MixingPlane(CGeometry *geometry, CSolver **solver_container, CNumerics *conv_numerics, CNumerics *visc_numerics, CConfig *config, unsigned short val_marker) {
 
   unsigned short iDim, iSpan;
-  unsigned long  oldVertex, iPoint, Point_Normal;
-  long iVertex;
+  unsigned long  oldVertex, iPoint, Point_Normal, iVertex;
   su2double *V_inlet, *V_domain, *Normal;
   su2double extAverageNu;
   Normal = new su2double[nDim];
@@ -2072,7 +2065,7 @@ void CTurbSASolver::BC_Inlet_MixingPlane(CGeometry *geometry, CSolver **solver_c
 
     /*--- Loop over all the vertices on this boundary marker ---*/
 
-    for (iVertex = 0; iVertex < geometry->nVertexSpan[val_marker][iSpan]; iVertex++) {
+    for (iVertex = 0; iVertex < geometry->GetnVertexSpan(val_marker,iSpan); iVertex++) {
 
       /*--- find the node related to the vertex ---*/
       iPoint = geometry->turbovertex[val_marker][iSpan][iVertex]->GetNode();
@@ -2163,8 +2156,7 @@ void CTurbSASolver::BC_Inlet_MixingPlane(CGeometry *geometry, CSolver **solver_c
 void CTurbSASolver::BC_Inlet_Turbo(CGeometry *geometry, CSolver **solver_container, CNumerics *conv_numerics, CNumerics *visc_numerics, CConfig *config, unsigned short val_marker) {
 
   unsigned short iDim, iSpan;
-  unsigned long  oldVertex, iPoint, Point_Normal;
-  long iVertex;
+  unsigned long  oldVertex, iPoint, Point_Normal, iVertex;
   su2double *V_inlet, *V_domain, *Normal;
 
   su2double rho, pressure, muLam, Factor_nu_Inf, nu_tilde;
@@ -2189,7 +2181,7 @@ void CTurbSASolver::BC_Inlet_Turbo(CGeometry *geometry, CSolver **solver_contain
 
 
     /*--- Loop over all the vertices on this boundary marker ---*/
-    for (iVertex = 0; iVertex < geometry->nVertexSpan[val_marker][iSpan]; iVertex++) {
+    for (iVertex = 0; iVertex < geometry->GetnVertexSpan(val_marker,iSpan); iVertex++) {
 
       /*--- find the node related to the vertex ---*/
       iPoint = geometry->turbovertex[val_marker][iSpan][iVertex]->GetNode();
@@ -4039,8 +4031,7 @@ void CTurbSSTSolver::BC_Inlet_MixingPlane(CGeometry *geometry, CSolver **solver_
                               unsigned short val_marker) {
 
   unsigned short iVar, iSpan, iDim;
-  unsigned long  oldVertex, iPoint, Point_Normal;
-  long iVertex;
+  unsigned long  oldVertex, iPoint, Point_Normal, iVertex;
   su2double *V_inlet, *V_domain, *Normal;
   su2double extAverageKine, extAverageOmega;
   unsigned short nSpanWiseSections = config->GetnSpanWiseSections();
@@ -4057,7 +4048,7 @@ void CTurbSSTSolver::BC_Inlet_MixingPlane(CGeometry *geometry, CSolver **solver_
 
     /*--- Loop over all the vertices on this boundary marker ---*/
 
-    for (iVertex = 0; iVertex < geometry->nVertexSpan[val_marker][iSpan]; iVertex++) {
+    for (iVertex = 0; iVertex < geometry->GetnVertexSpan(val_marker,iSpan); iVertex++) {
 
       /*--- find the node related to the vertex ---*/
       iPoint = geometry->turbovertex[val_marker][iSpan][iVertex]->GetNode();
@@ -4141,8 +4132,7 @@ void CTurbSSTSolver::BC_Inlet_Turbo(CGeometry *geometry, CSolver **solver_contai
                               unsigned short val_marker) {
 
   unsigned short iVar, iSpan, iDim;
-  unsigned long  oldVertex, iPoint, Point_Normal;
-  long iVertex;
+  unsigned long  oldVertex, iPoint, Point_Normal, iVertex;
   su2double *V_inlet, *V_domain, *Normal;
   unsigned short nSpanWiseSections = config->GetnSpanWiseSections();
 
@@ -4182,7 +4172,7 @@ void CTurbSSTSolver::BC_Inlet_Turbo(CGeometry *geometry, CSolver **solver_contai
     omega_b = rho*kine/(muLam*viscRatio);
 
     /*--- Loop over all the vertices on this boundary marker ---*/
-    for (iVertex = 0; iVertex < geometry->nVertexSpan[val_marker][iSpan]; iVertex++) {
+    for (iVertex = 0; iVertex < geometry->GetnVertexSpan(val_marker,iSpan); iVertex++) {
 
       /*--- find the node related to the vertex ---*/
       iPoint = geometry->turbovertex[val_marker][iSpan][iVertex]->GetNode();
diff --git a/SU2_DEF/include/SU2_DEF.hpp b/SU2_DEF/include/SU2_DEF.hpp
index 596655e00c92..ff0da4a73ad2 100644
--- a/SU2_DEF/include/SU2_DEF.hpp
+++ b/SU2_DEF/include/SU2_DEF.hpp
@@ -30,6 +30,7 @@
 #pragma once
 
 #include "../../Common/include/mpi_structure.hpp"
+#include "../../Common/include/omp_structure.hpp"
 
 #include <cstdlib>
 #include <iostream>
diff --git a/SU2_DEF/src/SU2_DEF.cpp b/SU2_DEF/src/SU2_DEF.cpp
index 5922cb812f99..8d2b0e2fe15b 100644
--- a/SU2_DEF/src/SU2_DEF.cpp
+++ b/SU2_DEF/src/SU2_DEF.cpp
@@ -40,7 +40,12 @@ int main(int argc, char *argv[]) {
   /*--- MPI initialization ---*/
 
 #ifdef HAVE_MPI
-  SU2_MPI::Init(&argc,&argv);
+#ifdef HAVE_OMP
+  int provided;
+  SU2_MPI::Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
+#else
+  SU2_MPI::Init(&argc, &argv);
+#endif
   SU2_MPI::Comm MPICommunicator(MPI_COMM_WORLD);
 #else
   SU2_Comm MPICommunicator(0);
diff --git a/SU2_DOT/include/SU2_DOT.hpp b/SU2_DOT/include/SU2_DOT.hpp
index 09dfe1ab2647..b2c2e4e28479 100644
--- a/SU2_DOT/include/SU2_DOT.hpp
+++ b/SU2_DOT/include/SU2_DOT.hpp
@@ -30,6 +30,7 @@
 #pragma once
 
 #include "../../Common/include/mpi_structure.hpp"
+#include "../../Common/include/omp_structure.hpp"
 
 #include <cstdlib>
 #include <iostream>
diff --git a/SU2_DOT/src/SU2_DOT.cpp b/SU2_DOT/src/SU2_DOT.cpp
index 12d5b0eb5971..66e251d84861 100644
--- a/SU2_DOT/src/SU2_DOT.cpp
+++ b/SU2_DOT/src/SU2_DOT.cpp
@@ -45,7 +45,12 @@ int main(int argc, char *argv[]) {
   /*--- MPI initialization, and buffer setting ---*/
 
 #ifdef HAVE_MPI
-  SU2_MPI::Init(&argc,&argv);
+#ifdef HAVE_OMP
+  int provided;
+  SU2_MPI::Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
+#else
+  SU2_MPI::Init(&argc, &argv);
+#endif
   SU2_MPI::Comm MPICommunicator(MPI_COMM_WORLD);
 #else
   SU2_Comm MPICommunicator(0);
diff --git a/TestCases/parallel_regression_AD.py b/TestCases/parallel_regression_AD.py
index 1564515e38f2..b23394be7ac2 100644
--- a/TestCases/parallel_regression_AD.py
+++ b/TestCases/parallel_regression_AD.py
@@ -229,7 +229,7 @@ def main():
     discadj_fea.cfg_dir   = "disc_adj_fea"
     discadj_fea.cfg_file  = "configAD_fem.cfg" 
     discadj_fea.test_iter = 9
-    discadj_fea.test_vals = [-6.492475, -6.401201, -0.000364, -8.708700] #last 4 columns
+    discadj_fea.test_vals = [-6.070230, -6.262517, -0.000364, -8.708700] #last 4 columns
     discadj_fea.su2_exec  = "parallel_computation.py -f"
     discadj_fea.timeout   = 1600
     discadj_fea.tol       = 0.00001
diff --git a/TestCases/pastix_support/readme.txt b/TestCases/pastix_support/readme.txt
index a2af3ff3a4d9..04fe6f19b349 100644
--- a/TestCases/pastix_support/readme.txt
+++ b/TestCases/pastix_support/readme.txt
@@ -2,7 +2,7 @@
 % SU2 configuration file                                                 %
 % PaStiX support build instructions.                                     %
 % Institution: Imperial College London                                   %
-% File Version 7.0.0 "Blackbird"                                            %
+% File Version 7.0.0 "Blackbird"                                         %
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %
 % 1 - Download
diff --git a/TestCases/serial_regression_AD.py b/TestCases/serial_regression_AD.py
index 86f241a6838b..ab2d9f613812 100644
--- a/TestCases/serial_regression_AD.py
+++ b/TestCases/serial_regression_AD.py
@@ -214,7 +214,7 @@ def main():
     discadj_fea.cfg_dir   = "disc_adj_fea"
     discadj_fea.cfg_file  = "configAD_fem.cfg" 
     discadj_fea.test_iter = 9
-    discadj_fea.test_vals = [-6.319841, -6.375512, -0.000364, -8.708700] #last 4 columns
+    discadj_fea.test_vals = [-6.352150, -6.402687, -0.000364, -8.708700] #last 4 columns
     discadj_fea.su2_exec  = "SU2_CFD_AD"
     discadj_fea.timeout   = 1600
     discadj_fea.tol       = 0.00001