su2code · pcarruscag · Mar 26, 2020 · Feb 27, 2020 · Feb 28, 2020 · Feb 28, 2020
diff --git a/Common/include/geometry/CGeometry.hpp b/Common/include/geometry/CGeometry.hpp
@@ -1621,7 +1621,7 @@ class CGeometry {
    * \param[in] fillLvl - Level of fill of the pattern.
    * \return Reference to the sparse pattern.
    */
-  const CCompressedSparsePatternUL& GetSparsePattern(ConnectivityType type, unsigned long fillLvl);
+  const CCompressedSparsePatternUL& GetSparsePattern(ConnectivityType type, unsigned long fillLvl = 0);
 
   /*!
    * \brief Get the edge to sparse pattern map.
@@ -1630,12 +1630,25 @@ class CGeometry {
    */
   const CEdgeToNonZeroMapUL& GetEdgeToSparsePatternMap(void);
 
+  /*!
+   * \brief Get the transpose of the (main, i.e. 0 fill) sparse pattern (e.g. CSR becomes CSC).
+   * \param[in] type - Finite volume or finite element.
+   * \return Reference to the map.
+   */
+  const su2vector<unsigned long>& GetTransposeSparsePatternMap(ConnectivityType type);
+
   /*!
    * \brief Get the edge coloring.
    * \note This method computes the coloring if that has not been done yet.
+   * \param[out] efficiency - optional output of the coloring efficiency.
    * \return Reference to the coloring.
    */
-  const CCompressedSparsePatternUL& GetEdgeColoring(void);
+  const CCompressedSparsePatternUL& GetEdgeColoring(su2double* efficiency = nullptr);
+
+  /*!
+   * \brief Force the natural (sequential) edge coloring.
+   */
+  void SetNaturalEdgeColoring();
 
   /*!
    * \brief Get the group size used in edge coloring.
@@ -1646,9 +1659,15 @@ class CGeometry {
   /*!
    * \brief Get the element coloring.
    * \note This method computes the coloring if that has not been done yet.
+   * \param[out] efficiency - optional output of the coloring efficiency.
    * \return Reference to the coloring.
    */
-  const CCompressedSparsePatternUL& GetElementColoring(void);
+  const CCompressedSparsePatternUL& GetElementColoring(su2double* efficiency = nullptr);
+
+  /*!
+   * \brief Force the natural (sequential) element coloring.
+   */
+  void SetNaturalElementColoring();
 
   /*!
    * \brief Get the group size used in element coloring.

diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -95,7 +95,7 @@ class CSysMatrix {
   const unsigned long *row_ptr;     /*!< \brief Pointers to the first element in each row. */
   const unsigned long *dia_ptr;     /*!< \brief Pointers to the diagonal element in each row. */
   const unsigned long *col_ind;     /*!< \brief Column index for each of the elements in val(). */
-  vector<const ScalarType*> col_ptr;/*!< \brief The transpose of col_ind, pointer to blocks with the same column index. */
+  const unsigned long *col_ptr;     /*!< \brief The transpose of col_ind, pointer to blocks with the same column index. */
 
   ScalarType *ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
   unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -349,10 +349,12 @@ class CSysMatrix {
    * \param[in] neqn - Number of equations.
    * \param[in] geometry - Geometrical definition of the problem.
    * \param[in] config - Definition of the particular problem.
+   * \param[in] needTranspPtr - If "col_ptr" should be created.
    */
   void Initialize(unsigned long npoint, unsigned long npointdomain,
                   unsigned short nvar, unsigned short neqn,
-                  bool EdgeConnect, CGeometry *geometry, CConfig *config);
+                  bool EdgeConnect, CGeometry *geometry,
+                  CConfig *config, bool needTranspPtr = false);
 
   /*!
    * \brief Sets to zero all the entries of the sparse matrix.
@@ -584,11 +586,13 @@ class CSysMatrix {
    * \brief Update 2 blocks ij and ji (add to i* sub from j*).
    * \note The template parameter Sign, can be used create a "subtractive"
    *       update i.e. subtract from row i and add to row j instead.
+   *       The parameter Overwrite allows completely writing over the
+   *       current values held by the matrix.
    * \param[in] edge - Index of edge that connects iPoint and jPoint.
    * \param[in] block_i - Subs from ji.
    * \param[in] block_j - Adds to ij.
    */
-  template<class OtherType, int Sign = 1>
+  template<class OtherType, int Sign = 1, bool Overwrite = false>
   inline void UpdateBlocks(unsigned long iEdge, const OtherType* const* block_i, const OtherType* const* block_j) {
 
     ScalarType *bij = &matrix[edge_ptr(iEdge,0)*nVar*nEqn];
@@ -598,8 +602,8 @@ class CSysMatrix {
 
     for (iVar = 0; iVar < nVar; iVar++) {
       for (jVar = 0; jVar < nEqn; jVar++) {
-        bij[offset] += PassiveAssign<ScalarType,OtherType>(block_j[iVar][jVar]) * Sign;
-        bji[offset] -= PassiveAssign<ScalarType,OtherType>(block_i[iVar][jVar]) * Sign;
+        bij[offset] = (Overwrite? ScalarType(0) : bij[offset]) + PassiveAssign<ScalarType,OtherType>(block_j[iVar][jVar]) * Sign;
+        bji[offset] = (Overwrite? ScalarType(0) : bji[offset]) - PassiveAssign<ScalarType,OtherType>(block_i[iVar][jVar]) * Sign;
         ++offset;
       }
     }
@@ -613,6 +617,14 @@ class CSysMatrix {
     UpdateBlocks<OtherType,-1>(iEdge, block_i, block_j);
   }
 
+  /*!
+   * \brief Short-hand for the overwriting version of UpdateBlocks (Sign = 1, Overwrite = true), i.e. sets ij = block_j and ji = -block_i.
+   */
+  template<class OtherType>
+  inline void SetBlocks(unsigned long iEdge, const OtherType* const* block_i, const OtherType* const* block_j) {
+    UpdateBlocks<OtherType,1,true>(iEdge, block_i, block_j);
+  }
+
   /*!
    * \brief Adds the specified block to the (i, i) subblock of the matrix-by-blocks structure.
    * \param[in] block_i - Diagonal index.

diff --git a/Common/include/omp_structure.hpp b/Common/include/omp_structure.hpp
@@ -80,6 +80,19 @@ inline void omp_set_num_threads(int) { }
  */
 inline constexpr int omp_get_thread_num(void) {return 0;}
 
+/*!
+ * \brief Dummy lock type and associated functions, all of which do nothing.
+ */
+struct omp_lock_t {};
+struct DummyVectorOfLocks {
+  omp_lock_t l;  /*!< \brief The single lock that is returned for every index. */
+  inline omp_lock_t& operator[](int) {return l;}
+};
+inline void omp_init_lock(omp_lock_t*){}
+inline void omp_set_lock(omp_lock_t*){}
+inline void omp_unset_lock(omp_lock_t*){}
+inline void omp_destroy_lock(omp_lock_t*){}
+
 #endif
 
 /*--- Convenience macros (do not use excessive nesting of macros). ---*/
@@ -108,6 +121,14 @@ inline constexpr size_t roundUpDiv(size_t numerator, size_t denominator)
   return (numerator+denominator-1)/denominator;
 }
 
+/*!
+ * \brief Round "argument" up to the next multiple of "multiple" (unchanged if it already is one).
+ */
+inline constexpr size_t nextMultiple(size_t argument, size_t multiple)
+{
+  return roundUpDiv(argument, multiple) * multiple;
+}
+
 /*!
  * \brief Compute a chunk size based on totalWork and number of threads such that
  *        all threads get the same number of chunks (with limited size).

diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp
@@ -133,6 +133,8 @@ const int SU2_CONN_SIZE   = 10;  /*!< \brief Size of the connectivity array that
                                              that we read from a mesh file in the format [[globalID vtkType n0 n1 n2 n3 n4 n5 n6 n7 n8]. */
 const int SU2_CONN_SKIP   = 2;   /*!< \brief Offset to skip the globalID and VTK type at the start of the element connectivity list for each CGNS element. */
 
+const su2double COLORING_EFF_THRESH = 0.875;  /*!< \brief Below this value fallback strategies are used instead. */
+
 /*!
  * \brief Boolean answers
  */

diff --git a/Common/include/toolboxes/C1DInterpolation.hpp b/Common/include/toolboxes/C1DInterpolation.hpp
@@ -84,11 +84,6 @@ class CAkimaInterpolation final: public C1DInterpolation{
       SetSpline(X,Data);
   }
 
-  /*!
-   * \brief Destructor of the CAkimaInterpolation class.
-   */
-  ~CAkimaInterpolation(){}
-
   /*!
    * \brief for setting the cofficients for the Akima spline.
    * \param[in] X - the x values.
@@ -119,11 +114,6 @@ class CLinearInterpolation final: public C1DInterpolation{
       SetSpline(X,Data);
   }
 
-  /*!
-   * \brief Destructor of the CInletInterpolation class.
-   */
-  ~CLinearInterpolation(){}
-
   /*!
    * \brief for setting the cofficients for Linear 'spline'.
    * \param[in] X - the x values.

diff --git a/Common/include/toolboxes/graph_toolbox.hpp b/Common/include/toolboxes/graph_toolbox.hpp
@@ -28,6 +28,7 @@
 #pragma once
 
 #include "C2DContainer.hpp"
+#include "../omp_structure.hpp"
 
 #include <set>
 #include <vector>
@@ -59,6 +60,7 @@ class CCompressedSparsePattern {
   su2vector<Index_t> m_outerPtr; /*!< \brief Start positions of the inner indices for each outer index. */
   su2vector<Index_t> m_innerIdx; /*!< \brief Inner indices of the non zero entries. */
   su2vector<Index_t> m_diagPtr;  /*!< \brief Position of the diagonal entry. */
+  su2vector<Index_t> m_innerIdxTransp; /*!< \brief Position of the transpose non zero entries, requires symmetry. */
 
 public:
   using IndexType = Index_t;
@@ -107,10 +109,30 @@ class CCompressedSparsePattern {
     if(!m_diagPtr.empty()) return;
 
     m_diagPtr.resize(getOuterSize());
+
+    SU2_OMP_PARALLEL_(for schedule(static,roundUpDiv(getOuterSize(),omp_get_max_threads())))
     for(Index_t k = 0; k < getOuterSize(); ++k)
       m_diagPtr(k) = findInnerIdx(k,k);
   }
 
+  /*!
+   * \brief Build, for each non zero entry (i,j), the position of its transpose entry (j,i) in the pattern, requires symmetry.
+   */
+  void buildTransposePtr() {
+    if(!m_innerIdxTransp.empty()) return;  /*--- Already built, nothing to do. ---*/
+
+    m_innerIdxTransp.resize(getNumNonZeros());
+
+    SU2_OMP_PARALLEL_(for schedule(static,roundUpDiv(getOuterSize(),omp_get_max_threads())))
+    for(Index_t i = 0; i < getOuterSize(); ++i) {
+      for(Index_t k = m_outerPtr(i); k < m_outerPtr(i+1); ++k) {
+        auto j = m_innerIdx(k);
+        m_innerIdxTransp(k) = findInnerIdx(j,i);  /*--- Non zero k is entry (i,j), look up entry (j,i). ---*/
+        assert(m_innerIdxTransp(k) != m_innerIdx.size() && "The pattern is not symmetric.");
+      }
+    }
+  }
+
   /*!
    * \return True if the pattern is empty, i.e. has not been built yet.
    */
@@ -224,6 +246,14 @@ class CCompressedSparsePattern {
     return m_diagPtr.data();
   }
 
+  /*!
+   * \return Const reference to the transpose pointer vector (asserts that it has already been built).
+   */
+  inline const su2vector<Index_t>& transposePtr() const {
+    assert(!m_innerIdxTransp.empty() && "Transpose map has not been built.");
+    return m_innerIdxTransp;
+  }
+
   /*!
    * \return The minimum inner index.
    */
@@ -384,6 +414,30 @@ CEdgeToNonZeroMap<Index_t> mapEdgesToSparsePattern(Geometry_t& geometry,
 }
 
 
+/*!
+ * \brief Create the natural coloring (equivalent to the normal sequential loop
+ *        order) for a given number of inner indexes.
+ * \note This is to reduce overhead in "OpenMP-ready" code when only 1 thread is used.
+ * \param[in] numInnerIndexes - Number of indexes that are to be colored.
+ * \return Natural (sequential) coloring of the inner indices, i.e. a single color.
+ */
+template<class T = CCompressedSparsePatternUL,
+         class Index_t = typename T::IndexType>
+T createNaturalColoring(Index_t numInnerIndexes)
+{
+  /*--- One color (one outer index) spanning the range [0, numInnerIndexes). ---*/
+  su2vector<Index_t> outerPtr(2);
+  outerPtr(0) = 0;
+  outerPtr(1) = numInnerIndexes;
+
+  /*--- Containing all indexes in ascending order (0, 1, ..., numInnerIndexes-1). ---*/
+  su2vector<Index_t> innerIdx(numInnerIndexes);
+  std::iota(innerIdx.data(), innerIdx.data()+numInnerIndexes, 0);
+
+  return T(std::move(outerPtr), std::move(innerIdx));
+}
+
+
 /*!
  * \brief Color contiguous groups of outer indices of a sparse pattern such that
  *        within each color, any two groups do not have inner indices in common.
@@ -404,7 +458,7 @@ CEdgeToNonZeroMap<Index_t> mapEdgesToSparsePattern(Geometry_t& geometry,
  * \param[out] indexColor - Optional, vector with colors given to the outer indices.
  * \return Coloring in the same type of the input pattern.
  */
-template<class T, typename Color_t = char, size_t MaxColors = 64, size_t MaxMB = 128>
+template<class T, typename Color_t = char, size_t MaxColors = 32, size_t MaxMB = 128>
 T colorSparsePattern(const T& pattern, size_t groupSize = 1, bool balanceColors = false,
                      std::vector<Color_t>* indexColor = nullptr)
 {
@@ -415,6 +469,10 @@ T colorSparsePattern(const T& pattern, size_t groupSize = 1, bool balanceColors
 
   const Index_t grpSz = groupSize;
   const Index_t nOuter = pattern.getOuterSize();
+
+  /*--- Trivial case. ---*/
+  if(groupSize >= nOuter) return createNaturalColoring(nOuter);
+
   const Index_t minIdx = pattern.getMinInnerIdx();
   const Index_t nInner = pattern.getMaxInnerIdx()+1-minIdx;
 
@@ -520,30 +578,6 @@ T colorSparsePattern(const T& pattern, size_t groupSize = 1, bool balanceColors
 }
 
 
-/*!
- * \brief Create the natural coloring (equivalent to the normal sequential loop
- *        order) for a given number of inner indexes.
- * \note This is to reduce overhead in "OpenMP-ready" code when only 1 thread is used.
- * \param[in] numInnerIndexes - Number of indexes that are to be colored.
- * \return Natural (sequential) coloring of the inner indices.
- */
-template<class T = CCompressedSparsePatternUL,
-         class Index_t = typename T::IndexType>
-T createNaturalColoring(Index_t numInnerIndexes)
-{
-  /*--- One color. ---*/
-  su2vector<Index_t> outerPtr(2);
-  outerPtr(0) = 0;
-  outerPtr(1) = numInnerIndexes;
-
-  /*--- Containing all indexes in ascending order. ---*/
-  su2vector<Index_t> innerIdx(numInnerIndexes);
-  std::iota(innerIdx.data(), innerIdx.data()+numInnerIndexes, 0);
-
-  return T(std::move(outerPtr), std::move(innerIdx));
-}
-
-
 /*!
  * \brief A way to represent one grid color that allows range-for syntax.
  */
@@ -553,9 +587,11 @@ struct GridColor
   static_assert(std::is_integral<T>::value,"");
 
   const T size;
+  T groupSize;
   const T* const indices;
 
-  GridColor(const T* idx = nullptr, T sz = 0) : size(sz), indices(idx) { }
+  GridColor(const T* idx = nullptr, T sz = 0, T grp = 0) :
+    size(sz), groupSize(grp), indices(idx) { }
 
   inline const T* begin() const {return indices;}
   inline const T* end() const {return indices+size;}
@@ -592,3 +628,23 @@ struct DummyGridColor
   inline IteratorLikeInt begin() const {return IteratorLikeInt(0);}
   inline IteratorLikeInt end() const {return IteratorLikeInt(size);}
 };
+
+
+/*!
+ * \brief Computes the efficiency of a grid coloring for given number of threads and chunk size, as the ratio of ideal to quantized work per thread.
+ */
+template<class SparsePattern>
+su2double coloringEfficiency(const SparsePattern& coloring, int numThreads, int chunkSize)
+{
+  using Index_t = typename SparsePattern::IndexType;
+
+  /*--- Ideally compute time is proportional to total work over number of threads. ---*/
+  su2double ideal = coloring.getNumNonZeros() / su2double(numThreads);
+
+  /*--- In practice the total work is quantized first by colors and then by chunks. ---*/
+  Index_t real = 0;
+  for(Index_t color = 0; color < coloring.getOuterSize(); ++color)
+    real += chunkSize * roundUpDiv(roundUpDiv(coloring.getNumNonZeros(color), chunkSize), numThreads);
+
+  return ideal / real;  /*--- NOTE(review): "real" stays 0 for an empty coloring, confirm callers never pass one. ---*/
+}