From 35785c226c93d8586d3c55aa2b496b3a7b67d2a3 Mon Sep 17 00:00:00 2001 From: Tim Fuller Date: Fri, 20 Sep 2024 11:15:51 -0600 Subject: [PATCH 01/23] treat copy and permute for the special case that both the source and target matrices are locally indexed --- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 95 +++++++++++++++---- 1 file changed, 79 insertions(+), 16 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index f0eef6b3b32e..475d6c9303c5 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -5688,6 +5688,7 @@ CrsMatrix:: verbose ? prefix.get()->c_str() : nullptr; const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed (); + const bool targetIsLocallyIndexed = this->isLocallyIndexed (); // // Copy the first numSame row from source to target (this matrix). // This involves copying rows corresponding to LIDs [0, numSame-1]. @@ -5696,17 +5697,65 @@ CrsMatrix:: nonconst_global_inds_host_view_type rowInds; nonconst_values_host_view_type rowVals; const LO numSameIDs_as_LID = static_cast (numSameIDs); - for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { - // Global ID for the current row index in the source matrix. - // The first numSameIDs GIDs in the two input lists are the - // same, so sourceGID == targetGID in this case. - const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); - const GO targetGID = sourceGID; - ArrayViewrowIndsConstView; - ArrayView rowValsConstView; + if (targetIsLocallyIndexed && sourceIsLocallyIndexed) { + // Create a mapping from the source's local column id's to my local column ids + using DT = typename Node::device_type; + const map_type& src_col_map = *(srcMat.getColMap()); + const map_type& tgt_col_map = *(this->getColMap()); + + auto local_src_col_map = src_col_map.getLocalMap(); + auto local_tgt_col_map = tgt_col_map.getLocalMap(); + + auto invalid = Teuchos::OrdinalTraits::invalid(); + auto num_src_cols = src_col_map.getLocalNumElements(); + Kokkos::UnorderedMap lid_map(num_src_cols); + for (int src_local_col_idx=0; src_local_col_idxgetLocalRowView(local_row, tgt_local_cols, tgt_local_vals); + + Kokkos::View indices("tgt_local_cols", src_local_cols.extent(0)); + Kokkos::View values("tgt_local_vals", src_local_cols.extent(0)); + int idx = 0; + for (int offset=0; offsetreplaceLocalValues(local_row, inds, vals); + } + } else if (sourceIsLocallyIndexed) { + for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { + // Global ID for the current row index in the source matrix. + // The first numSameIDs GIDs in the two input lists are the + // same, so sourceGID == targetGID in this case. + const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); + const GO targetGID = sourceGID; + + ArrayViewrowIndsConstView; + ArrayView rowValsConstView; - if (sourceIsLocallyIndexed) { const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); if (rowLength > static_cast (rowInds.size())) { Kokkos::resize(rowInds,rowLength); @@ -5744,8 +5793,23 @@ CrsMatrix:: Teuchos::RCP_DISABLE_NODE_LOOKUP); // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with // KDDKDD UVM TEMPORARY: KokkosView interface + // Applying a permutation to a matrix with a static graph + // means REPLACE-ing entries. + combineGlobalValues(targetGID, rowIndsConstView, + rowValsConstView, REPLACE, + prefix_raw, debug, verbose); } - else { // source matrix is globally indexed. + } else { + for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { + // Global ID for the current row index in the source matrix. + // The first numSameIDs GIDs in the two input lists are the + // same, so sourceGID == targetGID in this case. + const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); + const GO targetGID = sourceGID; + + ArrayViewrowIndsConstView; + ArrayView rowValsConstView; + global_inds_host_view_type rowIndsView; values_host_view_type rowValsView; srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); @@ -5762,13 +5826,12 @@ CrsMatrix:: // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with // KDDKDD UVM TEMPORARY: KokkosView interface + // Applying a permutation to a matrix with a static graph + // means REPLACE-ing entries. + combineGlobalValues(targetGID, rowIndsConstView, + rowValsConstView, REPLACE, + prefix_raw, debug, verbose); } - - // Applying a permutation to a matrix with a static graph - // means REPLACE-ing entries. - combineGlobalValues(targetGID, rowIndsConstView, - rowValsConstView, REPLACE, - prefix_raw, debug, verbose); } if (verbose) { From b7ce02ca9f1e92562c631fb6a4c917f971330b5c Mon Sep 17 00:00:00 2001 From: srkenno Date: Thu, 7 Nov 2024 08:49:39 -0700 Subject: [PATCH 02/23] draft: testing access to device pointers for column indices and values --- .../src/copyAndPermuteStaticGraph_new.hpp | 431 ++++++++++++++++++ 1 file changed, 431 insertions(+) create mode 100644 packages/tpetra/core/src/copyAndPermuteStaticGraph_new.hpp diff --git a/packages/tpetra/core/src/copyAndPermuteStaticGraph_new.hpp b/packages/tpetra/core/src/copyAndPermuteStaticGraph_new.hpp new file mode 100644 index 000000000000..2f619e6dd8c2 --- /dev/null +++ b/packages/tpetra/core/src/copyAndPermuteStaticGraph_new.hpp @@ -0,0 +1,431 @@ + // not yet a member function of CrsMatrix + template + void + copyAndPermuteStaticGraph_new( + const RowMatrix& srcMat, + RowMatrix& tgtMat, + const size_t numSameIDs, + const LocalOrdinal permuteToLIDs[], + const LocalOrdinal permuteFromLIDs[], + const size_t numPermutes) + { + using Details::ProfilingRegion; + using Teuchos::Array; + //using Teuchos::ArrayView; + using std::endl; + using LO = LocalOrdinal; + using GO = GlobalOrdinal; + + using impl_scalar_type = typename Kokkos::ArithTraits::val_type; + typedef typename Kokkos::View::non_const_type nonconst_values_device_view_type; + typedef typename Kokkos::View::non_const_type nonconst_global_inds_device_view_type; + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + + const char tfecfFuncName[] = "copyAndPermuteStaticGraph"; + ProfilingRegion regionCAP + ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph"); + + // const bool debug = Details::Behavior::debug("CrsGraph"); + // const bool verbose = Details::Behavior::verbose("CrsGraph"); + + using crs_matrix_type = CrsMatrix; + + const crs_matrix_type *srcMatCrsPtr = dynamic_cast(&srcMat); + if (!srcMatCrsPtr) { + std::cout << "srk error srcMat type= " << typeid(srcMat).name() << std::endl; + std::terminate(); + } + const crs_matrix_type& srcMatCrs = *srcMatCrsPtr; + + crs_matrix_type *tgtMatCrsPtr = dynamic_cast(&tgtMat); + if (!tgtMatCrsPtr) { + std::cout << "srk error tgtMat type= " << typeid(tgtMat).name() << std::endl; + std::terminate(); + } + crs_matrix_type& tgtMatCrs = *tgtMatCrsPtr; + + std::string prefix = tfecfFuncName; + // const char* const prefix_raw = prefix.c_str(); + + const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed (); + //const bool targetIsLocallyIndexed = tgtMat.isLocallyIndexed (); + // + // Copy the first numSame row from source to target (this matrix). + // This involves copying rows corresponding to LIDs [0, numSame-1]. + // + const auto& srcRowMap = * (srcMat.getRowMap ()); + auto comm = srcRowMap.getComm(); + + + const LO numSameIDs_as_LID = static_cast (numSameIDs); + + if (sourceIsLocallyIndexed) { + + typedef typename crs_matrix_type::local_matrix_device_type k_local_matrix_device_type; + //typedef typename k_local_matrix_device_type::StaticCrsGraphType k_graph_t; + // typedef typename k_graph_t::row_map_type::non_const_type k_row_map_t; + // typedef typename k_graph_t::entries_type::non_const_type k_nnz_t; + // typedef typename k_local_matrix_device_type::values_type::non_const_type k_scalar_view_t; + + const k_local_matrix_device_type & srcMatDevice = srcMatCrs.getLocalMatrixDevice(); + const k_local_matrix_device_type & tgtMatDevice = tgtMatCrs.getLocalMatrixDevice(); + const k_local_matrix_device_type * srcMatDevicePtr = &srcMatDevice; + const k_local_matrix_device_type * tgtMatDevicePtr = &tgtMatDevice; + + // auto lclIndsUnpacked_device = srcMat.getLocalIndicesDevice (); + + // auto nr = srcMatDevice.graph.numRows(); + // // if ((size_t)numSameIDs_as_LID >= (size_t)nr) { + // // std::cout << "numSameIDs_as_LID= " << numSameIDs_as_LID << " nr= " << nr << std::endl; + // // } + // TEUCHOS_ASSERT((size_t)numSameIDs_as_LID <= (size_t)nr); + +#define PR1(a) std::cout << "[srk] " << #a << "= " << a << std::endl + + typename crs_matrix_type::row_ptrs_device_view_type tgtLocalRowPtrsDevice = tgtMatCrs.getLocalRowPtrsDevice(); + typename crs_matrix_type::local_inds_device_view_type tgtLocalColIndsDevice = tgtMatCrs.getLocalIndicesDevice(); + typename crs_matrix_type::row_ptrs_host_view_type srcLocalRowPtrsHost = srcMatCrs.getLocalRowPtrsHost(); + typename crs_matrix_type::row_ptrs_device_view_type srcLocalRowPtrsDevice = srcMatCrs.getLocalRowPtrsDevice(); + typename crs_matrix_type::local_inds_device_view_type srcLocalColIndsDevice = srcMatCrs.getLocalIndicesDevice(); + + nonconst_global_inds_device_view_type srowInfo(Kokkos::ViewAllocateWithoutInitializing("srowInfo"), numSameIDs_as_LID); + + printf("here fence 0 numSameIDs_as_LID= %ld\n", numSameIDs_as_LID); + Kokkos::fence("srk0"); + printf("here fence 1\n"); + + typedef typename Node::execution_space exec_space; + typedef Kokkos::RangePolicy range_type; + + size_t mre=0; + for (LO sourceLID=0; sourceLID < numSameIDs_as_LID; sourceLID++) { + auto start = srcLocalRowPtrsHost(sourceLID); + auto end = srcLocalRowPtrsHost(sourceLID+1); + size_t rowLength = static_cast(end - start); + printf("sourceLID= %d start= %d end= %d rowLength= %ld\n", sourceLID, start, end, rowLength); + if (rowLength > mre) mre = rowLength; + } + printf("here b4 row_map, max_row_entries=%ld\n", mre); // prints 33 + Kokkos::parallel_for + ("Tpetra_CrsMatrix::copyAndPermuteStaticGraph", + range_type (0, numSameIDs_as_LID), + KOKKOS_LAMBDA(const LO sourceLID) + { + auto start = srcMatDevice.graph.row_map(sourceLID); // always print 0 + auto end = srcMatDevice.graph.row_map(sourceLID+1); // these print correctly + size_t rowLength = static_cast(end - start); + printf("0 k_sourceLID= %d start= %d end= %d rowLength= %ld\n", sourceLID, start, end, rowLength); + //printf("k_sourceLID= %d\n", sourceLID); + //srowInfo(sourceLID) = rowLength; + }); // kokkos parallel_for + + printf("here fence 2.0\n"); + Kokkos::fence("srk00"); + printf("here fence 2\n"); + + printf("here b4 srowInfo, max_row_entries=%ld\n", mre); + Kokkos::parallel_for + ("Tpetra_CrsMatrix::copyAndPermuteStaticGraph", + range_type (0, numSameIDs_as_LID), + KOKKOS_LAMBDA(const LO sourceLID) + { + auto start = srcLocalRowPtrsDevice(sourceLID); + auto end = srcLocalRowPtrsDevice(sourceLID+1); + size_t rowLength = static_cast(end - start); + printf("1 k_sourceLID= %d start= %d end= %d rowLength= %ld\n", sourceLID, start, end, rowLength); + //printf("k_sourceLID= %d\n", sourceLID); + //srowInfo(sourceLID) = rowLength; + }); // kokkos parallel_for + + printf("here fence 2.0\n"); + Kokkos::fence("srk00"); + printf("here fence 2\n"); + + size_t max_row_entries = 0; + Kokkos::parallel_reduce ("Tpetra_CrsMatrix_capsg_get_max_nc", range_type (0, numSameIDs_as_LID), + KOKKOS_LAMBDA(const LO sourceLID, size_t& gmax) { + auto start = srcLocalRowPtrsDevice(sourceLID); + auto end = srcLocalRowPtrsDevice(sourceLID+1); + size_t ct = static_cast(end - start); + + if (ct > gmax) gmax = ct; + }, max_row_entries); + + printf("here 0-af-pr: max_row_entries= %ld mre= %ld\n", max_row_entries, mre); + max_row_entries = mre; + + Kokkos::fence("srk1"); + printf("here 0-af-fence:\n"); + + auto local_map = srcMat.getRowMap()->getLocalMap(); + auto local_map_ptr = &local_map; + auto local_col_map = srcMat.getColMap()->getLocalMap(); + auto local_col_map_ptr = &local_col_map; + + nonconst_global_inds_device_view_type rowInds(Kokkos::ViewAllocateWithoutInitializing("srk_rowInds"), max_row_entries); + nonconst_values_device_view_type rowVals(Kokkos::ViewAllocateWithoutInitializing("srk_rowVals"), max_row_entries); + + bool tgtMatIsSorted = tgtMatCrs.getCrsGraph()->isSorted(); + + using local_map_type = typename crs_matrix_type::map_type::local_map_type; + //using crs_graph_type = CrsGraph; + + local_map_type tgt_local_map = tgtMatCrs.getRowMap()->getLocalMap(); + local_map_type tgt_local_col_map = tgtMatCrs.getColMap()->getLocalMap(); + auto tgt_local_map_ptr = &tgt_local_map; + auto tgt_local_col_map_ptr = &tgt_local_col_map; + + auto my_replaceGlobalValuesImpl + = KOKKOS_LAMBDA( + const bool sorted, const bool atomic, size_t hint[], + const size_t numInTgtRow, const LO tgtColInds[], impl_scalar_type tgtRowVals[], + const size_t numToReplace, const GO inds[], const impl_scalar_type newVals[] + ) -> LO + { + LO numValid = 0; // number of valid input column indices + + if (atomic) { + for (LO j = 0; j < (LO)numToReplace; ++j) { + const LO lclColInd = tgt_local_col_map_ptr->getLocalElement(inds[j]); + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + lclColInd, hint[0], sorted); + if (offset != numInTgtRow) { + Kokkos::atomic_store (&tgtRowVals[offset], newVals[j]); + hint[0] = offset + 1; + numValid++; + } + } + } + } else { + for (LO j = 0; j < (LO)numToReplace; ++j) { + const LO lclColInd = tgt_local_col_map_ptr->getLocalElement (inds[j]); + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + lclColInd, hint[0], sorted); + if (offset != numInTgtRow) { + tgtRowVals[offset] = newVals[j]; + hint[0] = offset + 1; + numValid++; + } + } + } + } + return numValid; + }; + + printf("here 0: R: %d row_map.size= %d numSameIDs_as_LID= %d\n", comm->getRank(), srcMatDevicePtr->graph.row_map.extent(0), numSameIDs_as_LID); + + printf("here 001: %d\n", comm->getRank()); + Kokkos::fence("srk01"); + printf("here 002: %d\n", comm->getRank()); + + const GO rl0 = + Tpetra::Details::getEntryOnHost(srowInfo, 0); + + printf("here 01: %d %ld\n", comm->getRank(), rl0); + + auto vals = srcMatCrs.getLocalValuesDevice (Access::ReadOnly); + auto tvals = tgtMatCrs.getLocalValuesDevice (Access::ReadWrite); + + Kokkos::fence("srk01"); + + Kokkos::parallel_for + ("Tpetra_CrsMatrix::copyAndPermuteStaticGraph", + range_type (0, numSameIDs_as_LID), + KOKKOS_LAMBDA(const LO sourceLID) + { + //printf("sourceLID= %d\n", sourceLID); + + auto start = srcLocalRowPtrsDevice(sourceLID); + auto end = srcLocalRowPtrsDevice(sourceLID+1); + size_t rowLength = static_cast(end - start); + // srowInfo(sourceLID) = rowLength; + [[maybe_unused]] size_t checkRowLength = 0; + + const size_t numEntries = rowLength; + //auto dev_row_info = srcMatDevicePtr->row(sourceLID); + + //KOKKOS_ASSERT(dev_row_info.length == rowLength); + + checkRowLength = numEntries; // first side effect + +#ifdef COMP +# undef COMP +#endif +#define COMP(a,b) do { if (int(a) != int(b)) { std::cout << "error: " << #a << "= " << a << " " << #b << "= " << b << " line= " << __LINE__ << std::endl; std::terminate(); } } while(0) + + for (size_t j = 0; j < rowLength; j++) { + // //auto ci = dev_row_info.colidx(j); + auto ci = srcLocalColIndsDevice(start + j); + auto gi = local_col_map_ptr->getGlobalElement(ci); + // rowInds(j) = gi; + // rowVals(j) = vals(start + j); + } + + // auto tgt_dev_row_info = tgtMatDevicePtr->row(sourceLID); + // LO *tgtColInds = &tgt_dev_row_info.colidx(0); + // Scalar *tgtRowVals = &tgt_dev_row_info.value(0); + // size_t numInTgtRow = tgt_dev_row_info.length; + auto tstart = tgtLocalRowPtrsDevice(sourceLID); + auto tend = tgtLocalRowPtrsDevice(sourceLID + 1); + size_t numInTgtRow = static_cast(tend - tstart); + Scalar *tgtRowVals = &tvals(tstart); + const LO *tgtColInds = &tgtLocalColIndsDevice(tstart); + + size_t hint=0; + // my_replaceGlobalValuesImpl(tgtMatIsSorted, false, &hint, // tgt_local_col_map, + // numInTgtRow, tgtColInds, tgtRowVals, + // rowLength, rowInds.data(), rowVals.data()); + + }); // kokkos parallel_for + + + printf("here 02: %d\n", comm->getRank()); + + Kokkos::fence("srk02"); + + { + bool tsd = tgtMatCrs.get_values_unpacked_wdv().need_sync_device(); + bool tsh = tgtMatCrs.get_values_unpacked_wdv().need_sync_host(); + bool ssd = srcMatCrs.get_values_unpacked_wdv().need_sync_device(); + bool ssh = srcMatCrs.get_values_unpacked_wdv().need_sync_host(); + + PR1(tsd); + PR1(tsh); + PR1(ssd); + PR1(ssh); + } + + // if (tsd) tgtMatCrs.get_values_unpacked_wdv().sync_device(); + // if (tsh) tgtMatCrs.get_values_unpacked_wdv().sync_host(); + // if (ssd) srcMatCrs.get_values_unpacked_wdv().sync_device(); + // if (ssh) srcMatCrs.get_values_unpacked_wdv().sync_host(); + // auto tgtSyncView = tgtMatCrs.get_values_unpacked_wdv().getHostView(Access::ReadOnly); + // const RowInfo rowInfo = tgtMatCrs.getCrsGraph()->getRowInfo(0); + // auto t2 = tgtMatCrs.getCrsGraph()->getLocalIndsViewHost(rowInfo); + // std::cout << "[srk] tgtSyncView= " << tgtSyncView.extent(0) << " t2= " << t2.extent(0) << std::endl; + + Kokkos::fence("srk2"); + + { + bool tsd = tgtMatCrs.get_values_unpacked_wdv().need_sync_device(); + bool tsh = tgtMatCrs.get_values_unpacked_wdv().need_sync_host(); + bool ssd = srcMatCrs.get_values_unpacked_wdv().need_sync_device(); + bool ssh = srcMatCrs.get_values_unpacked_wdv().need_sync_host(); + + PR1(tsd); + PR1(tsh); + PR1(ssd); + PR1(ssh); + } + + printf("here 1: %d\n", comm->getRank()); + + + } else { + for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { + // Global ID for the current row index in the source matrix. + // The first numSameIDs GIDs in the two input lists are the + // same, so sourceGID == targetGID in this case. + const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); + const GO targetGID = sourceGID; + + Teuchos::ArrayView rowIndsConstView; + Teuchos::ArrayView rowValsConstView; + + typename crs_matrix_type::global_inds_host_view_type rowIndsView; + typename crs_matrix_type::values_host_view_type rowValsView; + srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take + // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView + // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews + // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews + rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with + // KDDKDD UVM TEMPORARY: KokkosView interface + + // Applying a permutation to a matrix with a static graph + // means REPLACE-ing entries. + tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, + rowValsConstView); + } + } + + // + // "Permute" part of "copy and permute." + // + + typename crs_matrix_type::nonconst_global_inds_host_view_type rowInds; + typename crs_matrix_type::nonconst_values_host_view_type rowVals; + + const auto& tgtRowMap = * (tgtMat.getRowMap ()); + for (size_t p = 0; p < numPermutes; ++p) { + const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]); + const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]); + + Teuchos::ArrayView rowIndsConstView; + Teuchos::ArrayView rowValsConstView; + + if (sourceIsLocallyIndexed) { + const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); + if (rowLength > static_cast (rowInds.size ())) { + Kokkos::resize(rowInds,rowLength); + Kokkos::resize(rowVals,rowLength); + } + // Resizing invalidates an Array's views, so we must make new + // ones, even if rowLength hasn't changed. + typename crs_matrix_type::nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + typename crs_matrix_type::nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + + // The source matrix is locally indexed, so we have to get a + // copy. Really it's the GIDs that have to be copied (because + // they have to be converted from LIDs). + size_t checkRowLength = 0; + srcMat.getGlobalRowCopy(sourceGID, rowIndsView, + rowValsView, checkRowLength); + + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take + // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView + // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews + // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews + rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with + // KDDKDD UVM TEMPORARY: KokkosView interface + } + else { + typename crs_matrix_type::global_inds_host_view_type rowIndsView; + typename crs_matrix_type::values_host_view_type rowValsView; + srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take + // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView + // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews + // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews + rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with + // KDDKDD UVM TEMPORARY: KokkosView interface + } + + tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, + rowValsConstView); + } + + } From 53ae2cc8b019ab0bfedcef9aaf92ff3f237a7cf7 Mon Sep 17 00:00:00 2001 From: srkenno Date: Wed, 13 Nov 2024 10:50:37 -0700 Subject: [PATCH 03/23] Address performance issues in CrsMatrix copyAndPermute: 1) timers 2) batch version of Map::getGlobalElement 3) improved replaceGlobalValues 4) kokkos parallel reduce/for for the main loop over rows in CAP 5) small improvements using if constexpr() --- .../src/Panzer_ModelEvaluator_impl.hpp | 9 +- .../panzer/mini-em/example/BlockPrec/main.cpp | 97 ++- .../BlockPrec/maxwell-analyticSolution.xml | 2 +- .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 84 ++- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 631 ++++++++++++++++-- .../src/Tpetra_Details_WrappedDualView.hpp | 16 +- .../core/src/Tpetra_Details_crsUtils.hpp | 76 ++- packages/tpetra/core/src/Tpetra_Map_decl.hpp | 1 + packages/tpetra/core/src/Tpetra_Map_def.hpp | 40 +- 9 files changed, 867 insertions(+), 89 deletions(-) diff --git a/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp b/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp index 0ee307c88733..07bbc5344489 100644 --- a/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp +++ b/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp @@ -47,6 +47,10 @@ #include "Thyra_TpetraLinearOp.hpp" #include "Tpetra_CrsMatrix.hpp" +extern bool in_eval_J; +extern double timer_evalJ; +extern double timer_capsg; + // Constructors/Initializers/Accessors template @@ -1569,7 +1573,8 @@ evalModelImpl_basic(const Thyra::ModelEvaluatorBase::InArgs &inArgs, else if(Teuchos::is_null(f_out) && !Teuchos::is_null(W_out)) { PANZER_FUNC_TIME_MONITOR("panzer::ModelEvaluator::evalModel(J)"); - + double time_ = Teuchos::Time::wallTime(); + in_eval_J = true; // only add auxiliary global data if Jacobian is being formed ae_inargs.addGlobalEvaluationData(nonParamGlobalEvaluationData_); @@ -1582,6 +1587,8 @@ evalModelImpl_basic(const Thyra::ModelEvaluatorBase::InArgs &inArgs, thGhostedContainer->initializeMatrix(0.0); ae_tm_.template getAsObject()->evaluate(ae_inargs); + in_eval_J = false; + timer_evalJ += -time_ + Teuchos::Time::wallTime(); } // HACK: set A to null before calling responses to avoid touching the diff --git a/packages/panzer/mini-em/example/BlockPrec/main.cpp b/packages/panzer/mini-em/example/BlockPrec/main.cpp index 0efa0fc0879c..93426dead9e2 100644 --- a/packages/panzer/mini-em/example/BlockPrec/main.cpp +++ b/packages/panzer/mini-em/example/BlockPrec/main.cpp @@ -60,7 +60,12 @@ #include #include - +#include +#include +#include +#include +#include +#include template void writeToExodus(double time_stamp, @@ -93,6 +98,26 @@ using mini_em::physicsType, mini_em::MAXWELL, mini_em::DARCY; using mini_em::solverType, mini_em::AUGMENTATION, mini_em::MUELU, mini_em::ML, mini_em::CG, mini_em::GMRES; using mini_em::linearAlgebraType, mini_em::linAlgTpetra, mini_em::linAlgEpetra; +bool panzer_impl_old = true; +bool panzer_impl_new = false; + +int panzer_impl_inp = 0; // 0, 1, 2=both + +double timer_MV=0.0; +double timer_ICI=0.0; +bool in_eval_MV = false; +bool in_eval_J = false; +double timer_evalJ=0.0; +double timer_capsg=0.0; + + +template +static T parallel_reduce(Teuchos::RCP > comm, T& localVal, Teuchos::EReductionType red) { + T globalVal; + Teuchos::reduceAll (*comm, red, + localVal, Teuchos::outArg (globalVal)); + return globalVal; +} template int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) @@ -108,9 +133,10 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) if (comm->getSize() > 1) { out->setOutputToRootOnly(0); } - + Teuchos::RCP stacked_timer; - bool use_stacked_timer; + Teuchos::RCP timer; + bool use_stacked_timer, use_timer; std::string test_name = "MiniEM 3D RefMaxwell"; // Figure of merit data for acceptance testing @@ -138,12 +164,14 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) bool resetSolver = false; bool doSolveTimings = false; bool matrixFree = false; + bool use_timer_test = false; int numReps = 0; linearAlgebraType linAlgebraValues[2] = {linAlgTpetra, linAlgEpetra}; const char * linAlgebraNames[2] = {"Tpetra", "Epetra"}; linearAlgebraType linAlgebra = linAlgTpetra; clp.setOption("linAlgebra",&linAlgebra,2,linAlgebraValues,linAlgebraNames); - use_stacked_timer = true; + use_stacked_timer = false; + use_timer = true; print_fom = true; clp.setOption("x-elements",&x_elements); clp.setOption("y-elements",&y_elements); @@ -166,6 +194,8 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) clp.setOption("resetSolver","no-resetSolver",&resetSolver,"update the solver in every timestep"); clp.setOption("doSolveTimings","no-doSolveTimings",&doSolveTimings,"repeat the first solve \"numTimeSteps\" times"); clp.setOption("stacked-timer","no-stacked-timer",&use_stacked_timer,"Run with or without stacked timer output"); + clp.setOption("timer","no-timer",&use_timer,"Run with or without timer output"); + clp.setOption("new-impl",&panzer_impl_inp,"Run without (0) or with (1) new tpetra code, or both old & new (2)"); clp.setOption("test-name", &test_name, "Name of test (for Watchr output)"); clp.setOption("print-fom","no-print-fom",&print_fom,"print the figure of merit for acceptance testing"); #ifdef HAVE_TEUCHOS_STACKTRACE @@ -184,12 +214,31 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL: break; } + switch (panzer_impl_inp) { + case 0: + panzer_impl_new = false; + panzer_impl_old = true; + break; + case 1: + panzer_impl_new = true; + panzer_impl_old = false; + break; + case 2: + panzer_impl_new = true; + panzer_impl_old = true; + break; + default: + return EXIT_FAILURE; + } + + std::cout << "P" << comm->getRank() << ": [dbg] panzer_impl_old= " << panzer_impl_old << " panzer_impl_new= " << panzer_impl_new << std::endl; + + #ifdef HAVE_TEUCHOS_STACKTRACE if (stacktrace) Teuchos::print_stack_on_segfault(); #endif - if (use_stacked_timer) { stacked_timer = rcp(new Teuchos::StackedTimer("Mini-EM")); Teuchos::RCP verbose_out = Teuchos::rcp(new Teuchos::FancyOStream(Teuchos::rcpFromRef(std::cout))); @@ -199,6 +248,10 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) Teuchos::TimeMonitor::setStackedTimer(stacked_timer); Teuchos::TimeMonitor tM(*Teuchos::TimeMonitor::getNewTimer(std::string("Mini-EM: Total Time"))); + Teuchos::Time mainTimer("mainTimer", true); + + std::cout << "panzer_impl_new= " << panzer_impl_new << std::endl; + std::cout << "panzer_impl_old= " << panzer_impl_old << std::endl; if (doSolveTimings) { numReps = numTimeSteps; @@ -589,7 +642,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) auxOutArgs.set_W_op(aux_W_op); auxPhysicsME->evalModel(auxInArgs, auxOutArgs); } - + // setup a response library to write to the mesh RCP > stkIOResponseLibrary = buildSTKIOResponseLibrary(physicsBlocks,linObjFactory,wkstContainer,dofManager,cm_factory,mesh, @@ -726,6 +779,26 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) // Collect FOM data before everything goes out of scope fom_num_cells = mesh->getEntityCounts(dim); + + mainTimer.stop(); + if (comm->getRank() == 0) { + std::cout << "mainTimer= " << mainTimer.totalElapsedTime() << std::endl; + } + + if (use_timer) { + if (comm->getRank() == 0) std::cout << "summarize...\n"; + tM.summarize(); + if (comm->getRank() == 0) std::cout << "report...\n"; + auto params = rcp(new Teuchos::ParameterList()); + params->set("Report format", "Table"); // (default), "YAML" + params->set("YAML style", "spacious"); // (default), "compact" + params->set("How to merge timer sets", "Union"); // "Intersection"); // (default), "Union" + params->set("alwaysWriteLocal", true); // , false (default) + params->set("writeGlobalStats", true); // (default), false + params->set("writeZeroTimers", true); // : true (default), false + + tM.report(std::cout, "panzer", params); + } } // Output timer data @@ -733,6 +806,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) stacked_timer->stop("Mini-EM"); Teuchos::StackedTimer::OutputOptions options; options.output_fraction = options.output_histogram = options.output_minmax = true; + // options.output_fraction = options.output_minmax = options.align_columns = true; stacked_timer->report(*out, comm, options); auto xmlOut = stacked_timer->reportWatchrXML(test_name + ' ' + std::to_string(comm->getSize()) + " ranks", comm); if(xmlOut.length()) @@ -756,10 +830,15 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) *out << "=================================\n\n"; } - } else { + } else if (use_timer) { Teuchos::TimeMonitor::summarize(*out,false,true,false,Teuchos::Union,"",true); } - + timer_evalJ = parallel_reduce(comm, timer_evalJ, Teuchos::REDUCE_MAX); + timer_capsg = parallel_reduce(comm, timer_capsg, Teuchos::REDUCE_MAX); + if (!comm->getRank()) { + std::cout << "[dbg] timer_evalJ= " << timer_evalJ << std::endl; + std::cout << "[dbg] timer_capsg= " << timer_capsg << std::endl; + } return EXIT_SUCCESS; } @@ -794,6 +873,7 @@ int main(int argc,char * argv[]){ int retVal; if (linAlgebra == linAlgTpetra) { + // if (useComplex) { // #if defined(HAVE_TPETRA_COMPLEX_DOUBLE) // typedef typename panzer::BlockedTpetraLinearObjFactory,int,panzer::GlobalOrdinal> blockedLinObjFactory; @@ -817,6 +897,7 @@ int main(int argc,char * argv[]){ } else TEUCHOS_ASSERT(false); + Kokkos::finalize(); return retVal; diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml index 5dda86d61a0d..ee60498b2b23 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml @@ -3,7 +3,7 @@ - + diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 17fbcbfb9a5d..3e1164909362 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -41,6 +41,22 @@ #include #include +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 +#if EXP_INCLUDED_FROM_PANXER_MINI_EM +extern bool panzer_impl_new, panzer_impl_old; +extern bool in_eval_J; +extern double timer_evalJ; +extern double timer_capsg; +#else +namespace { +bool panzer_impl_new = true; +bool panzer_impl_old = !panzer_impl_new; +bool in_eval_J = false; +double timer_evalJ=0.0; +double timer_capsg=0.0; +} +#endif + namespace Tpetra { namespace Details { namespace Impl { @@ -964,6 +980,8 @@ namespace Tpetra { CrsGraph:: getLocalNumEntries () const { + Details::ProfilingRegion regionGLNE("Tpetra::CrsGraph::getLocalNumEntries"); + const char tfecfFuncName[] = "getLocalNumEntries: "; typedef LocalOrdinal LO; @@ -1185,7 +1203,6 @@ namespace Tpetra { CrsGraph:: allocateIndices (const ELocalGlobal lg, const bool verbose) { - using Details::ProfilingRegion; using Teuchos::arcp; using Teuchos::Array; using Teuchos::ArrayRCP; @@ -1196,7 +1213,7 @@ namespace Tpetra { const char tfecfFuncName[] = "allocateIndices: "; const char suffix[] = " Please report this bug to the Tpetra developers."; - ProfilingRegion profRegion("Tpetra::CrsGraph::allocateIndices"); + Details::ProfilingRegion profRegion("Tpetra::CrsGraph::allocateIndices"); std::unique_ptr prefix; if (verbose) { @@ -1593,6 +1610,8 @@ namespace Tpetra { typedef GlobalOrdinal GO; const char tfecfFuncName[] = "insertIndices: "; + Details::ProfilingRegion regionII("Tpetra::CrsGraph::insertIndices"); + size_t oldNumEnt = 0; if (debug_) { TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC @@ -1714,6 +1733,8 @@ namespace Tpetra { const char tfecfFuncName[] = "insertGlobalIndicesImpl: "; const LO lclRow = static_cast (rowInfo.localRow); + Details::ProfilingRegion regionIGII("Tpetra::CrsGraph::insertGlobalIndicesImpl"); + auto numEntries = rowInfo.numEntries; using inp_view_type = View; inp_view_type inputInds(inputGblColInds, numInputInds); @@ -1776,6 +1797,8 @@ namespace Tpetra { using LO = LocalOrdinal; const char tfecfFuncName[] = "insertLocallIndicesImpl: "; + Details::ProfilingRegion regionILII("Tpetra::CrsGraph::insertLocallIndicesImpl"); + const RowInfo rowInfo = this->getRowInfo(myRow); size_t numNewInds = 0; @@ -1837,6 +1860,8 @@ namespace Tpetra { using Kokkos::MemoryUnmanaged; auto invalidCount = Teuchos::OrdinalTraits::invalid(); + Details::ProfilingRegion regionFGI("Tpetra::CrsGraph::findGlobalIndices"); + using inp_view_type = View; inp_view_type inputInds(indices.getRawPtr(), indices.size()); @@ -1847,10 +1872,25 @@ namespace Tpetra { if (this->colMap_.is_null()) return invalidCount; const auto& colMap = *(this->colMap_); + auto map = [&](GO const gblInd){return colMap.getLocalElement(gblInd);}; - numFound = Details::findCrsIndices(lclRow, this->getRowPtrsUnpackedHost(), - rowInfo.numEntries, - lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); + + if (panzer_impl_new) { + if (this->isSorted()) { + numFound = Details::findCrsIndicesSorted(lclRow, this->getRowPtrsUnpackedHost(), + rowInfo.numEntries, + lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); + } else { + numFound = Details::findCrsIndices(lclRow, this->getRowPtrsUnpackedHost(), + rowInfo.numEntries, + lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); + } + } + if (panzer_impl_old) { + numFound = Details::findCrsIndices(lclRow, this->getRowPtrsUnpackedHost(), + rowInfo.numEntries, + lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); + } } else if (this->isGloballyIndexed()) { @@ -1861,7 +1901,6 @@ namespace Tpetra { return numFound; } - template size_t CrsGraph:: @@ -2313,6 +2352,8 @@ namespace Tpetra { using Teuchos::ArrayView; const char tfecfFuncName[] = "getGlobalRowCopy: "; + Details::ProfilingRegion regionGGRC("Tpetra::CrsGraph::getGlobalRowCopy"); + // This does the right thing (reports an empty row) if the input // row is invalid. const RowInfo rowinfo = getRowInfoFromGlobalRowIndex (globalRow); @@ -2324,16 +2365,31 @@ namespace Tpetra { numEntries = theNumEntries; // first side effect if (rowinfo.localRow != Teuchos::OrdinalTraits::invalid ()) { + if (isLocallyIndexed ()) { auto lclInds = getLocalIndsViewHost(rowinfo); - for (size_t j = 0; j < theNumEntries; ++j) { - indices[j] = colMap_->getGlobalElement (lclInds(j)); + if (panzer_impl_new) { + bool err = colMap_->getGlobalElements(lclInds.data(), theNumEntries, indices.data()); + if (err) { + std::cout << "[srk] error:" << std::endl; + std::terminate(); + } + } + if (panzer_impl_old) { + for (size_t j = 0; j < theNumEntries; ++j) { + indices[j] = colMap_->getGlobalElement (lclInds(j)); + } } } else if (isGloballyIndexed ()) { auto gblInds = getGlobalIndsViewHost(rowinfo); - for (size_t j = 0; j < theNumEntries; ++j) { - indices[j] = gblInds(j); + if (panzer_impl_new) { + std::memcpy((void*)indices.data(), (const void*) gblInds.data(), theNumEntries*sizeof(*indices.data())); + } + if (panzer_impl_old) { + for (size_t j = 0; j < theNumEntries; ++j) { + indices[j] = gblInds(j); + } } } } @@ -2912,6 +2968,8 @@ namespace Tpetra { using size_type = typename Teuchos::Array::size_type; const char tfecfFuncName[] = "globalAssemble: "; // for exception macro + Details::ProfilingRegion regionGA("Tpetra::CrsGraph::globalAssemble"); + std::unique_ptr prefix; if (verbose_) { prefix = this->createPrefix("CrsGraph", "globalAssemble"); @@ -3163,6 +3221,8 @@ namespace Tpetra { const char tfecfFuncName[] = "fillComplete: "; const bool verbose = verbose_; + Details::ProfilingRegion regionFC("Tpetra::CrsGraph::fillComplete"); + std::unique_ptr prefix; if (verbose) { prefix = this->createPrefix("CrsGraph", "fillComplete"); @@ -3531,6 +3591,8 @@ namespace Tpetra { "expertStaticFillComplete): "; const size_t lclNumRows = this->getLocalNumRows (); + Details::ProfilingRegion regionFLG("Tpetra::CrsGraph::fillLocalGraph"); + // This method's goal is to fill in the two arrays (compressed // sparse row format) that define the sparse graph's structure. @@ -4805,6 +4867,8 @@ namespace Tpetra { const char tfecfFuncName[] = "copyAndPermute: "; const bool verbose = verbose_; + Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermute"); + std::unique_ptr prefix; if (verbose) { prefix = this->createPrefix("CrsGraph", "copyAndPermute"); diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 475d6c9303c5..6e3db9a854c0 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -1,3 +1,8 @@ +/* Notes: + * 1. do we need atomic store in copyAndPermuteNew? Maybe the permute path actually walks on existing data, so maybe we need it. + * 2. cleanup panzer_impl_new/old paths + */ + // @HEADER // ***************************************************************************** // Tpetra: Templated Linear Algebra Services Package @@ -49,11 +54,28 @@ #include "KokkosSparse_spmv.hpp" #include +#include #include #include #include #include +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 +#if EXP_INCLUDED_FROM_PANXER_MINI_EM +extern bool panzer_impl_new, panzer_impl_old; +extern bool in_eval_J; +extern double timer_evalJ; +extern double timer_capsg; +#else +namespace { +bool panzer_impl_new = true; +bool panzer_impl_old = !panzer_impl_new; +bool in_eval_J = false; +double timer_evalJ=0.0; +double timer_capsg=0.0; +} +#endif + namespace Tpetra { namespace { // (anonymous) @@ -2423,15 +2445,115 @@ namespace Tpetra { const impl_scalar_type newVals[], const LocalOrdinal numElts) { - Teuchos::ArrayView indsT(inds, numElts); - auto fun = - [&](size_t const k, size_t const /*start*/, size_t const offset) { - rowVals[offset] = newVals[k]; - }; - std::function cb(std::ref(fun)); - return graph.findGlobalIndices(rowInfo, indsT, cb); + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + + [[maybe_unused]] LocalOrdinal niv=0; + + if (panzer_impl_old) { + Teuchos::ArrayView indsT(inds, numElts); + auto fun = + [&](size_t const k, size_t const /*start*/, size_t const offset) { + rowVals[offset] = newVals[k]; + }; + std::function cb(std::ref(fun)); + niv = graph.findGlobalIndices(rowInfo, indsT, cb); + } + + if (panzer_impl_new) { // new + typedef LocalOrdinal LO; + typedef GlobalOrdinal GO; + + const bool sorted = graph.isSorted (); + const bool atomic = useAtomicUpdatesByDefault; // FIXME + size_t hint = 0; // guess at the index's relative offset in the row + LO numValid = 0; // number of valid input column indices + + if (graph.isLocallyIndexed ()) { + // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its + // pointer does NOT change its reference count. Thus, this + // code is still thread safe. + if (graph.colMap_.is_null ()) { + // NO input column indices are valid in this case, since if + // the column Map is null on the calling process, then the + // calling process owns no graph entries. + return numValid; + } + const map_type& colMap = * (graph.colMap_); + + // Get a view of the column indices in the row. This amortizes + // the cost of getting the view over all the entries of inds. + auto colInds = graph.getLocalIndsViewHost (rowInfo); + if (atomic) { + for (LO j = 0; j < numElts; ++j) { + const LO lclColInd = colMap.getLocalElement (inds[j]); + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + lclColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + Kokkos::atomic_store (&rowVals[offset], newVals[j]); + hint = offset + 1; + numValid++; + } + } + } + } else { + for (LO j = 0; j < numElts; ++j) { + const LO lclColInd = colMap.getLocalElement (inds[j]); + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + lclColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + rowVals[offset]= newVals[j]; + hint = offset + 1; + numValid++; + } + } + } + } + } + else if (graph.isGloballyIndexed ()) { + // Get a view of the column indices in the row. This amortizes + // the cost of getting the view over all the entries of inds. + auto colInds = graph.getGlobalIndsViewHost (rowInfo); + + if (atomic) { + for (LO j = 0; j < numElts; ++j) { + const GO gblColInd = inds[j]; + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + gblColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + Kokkos::atomic_store (&rowVals[offset], newVals[j]); + hint = offset + 1; + numValid++; + } + } + } else { + for (LO j = 0; j < numElts; ++j) { + const GO gblColInd = inds[j]; + const size_t offset = + KokkosSparse::findRelOffset (colInds, rowInfo.numEntries, + gblColInd, hint, sorted); + if (offset != rowInfo.numEntries) { + rowVals[offset] = newVals[j]; + hint = offset + 1; + numValid++; + } + } + } + } + // If the graph is neither locally nor globally indexed on the + // calling process, that means the calling process has no graph + // entries. Thus, none of the input column indices are valid. + return numValid; + } + return LINV; } + template LocalOrdinal CrsMatrix:: @@ -2466,8 +2588,8 @@ namespace Tpetra { return Teuchos::OrdinalTraits::invalid (); } const crs_graph_type& graph = * (this->staticGraph_); - const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow); + if (rowInfo.localRow == Teuchos::OrdinalTraits::invalid ()) { // The input local row is invalid on the calling process, // which means that the calling process summed 0 entries. @@ -2475,9 +2597,12 @@ namespace Tpetra { } auto curRowVals = this->getValuesViewHostNonConst (rowInfo); + const IST* const inVals = reinterpret_cast (inputVals); - return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo, - inputGblColInds, inVals, numEnt); + auto xx = + this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo, + inputGblColInds, inVals, numEnt); + return xx; } template @@ -3235,10 +3360,21 @@ CrsMatrix:: const map_type& colMap = * (staticGraph_->colMap_); auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo); auto curVals = getValuesViewHost(rowinfo); - - for (size_t j = 0; j < theNumEntries; ++j) { - values[j] = curVals[j]; - indices[j] = colMap.getGlobalElement (curLclInds(j)); + if (panzer_impl_old) { + for (size_t j = 0; j < theNumEntries; ++j) { + values[j] = curVals[j]; + auto g = colMap.getGlobalElement (curLclInds(j)); + indices[j] = g; + } + } + if (panzer_impl_new) { + bool err = colMap.getGlobalElements(curLclInds.data(), numEntries, indices.data()); + if (err) { + std::cout << "[srk] error:" << std::endl; + std::terminate(); + } + // FIXME - this should/could be a kokkos deep copy? + std::memcpy((void*)values.data(), (const void*) curVals.data(), numEntries*sizeof(*values.data())); } } else if (staticGraph_->isGloballyIndexed ()) { @@ -3469,7 +3605,7 @@ CrsMatrix:: setAllValues ( const local_matrix_device_type& localDeviceMatrix) { using ProfilingRegion=Details::ProfilingRegion; - ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix"); + ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues1 from KokkosSparse::CrsMatrix"); auto graph = localDeviceMatrix.graph; //FIXME how to check whether graph is allocated @@ -3495,7 +3631,7 @@ CrsMatrix:: typedef impl_scalar_type IST; typedef typename local_graph_device_type::row_map_type row_map_type; //typedef typename row_map_type::non_const_value_type row_offset_type; - const char tfecfFuncName[] = "setAllValues(ArrayRCP, ArrayRCP, ArrayRCP): "; + const char tfecfFuncName[] = "setAllValues2(ArrayRCP, ArrayRCP, ArrayRCP): "; // The row offset type may depend on the execution space. It may // not necessarily be size_t. If it's not, we need to make a deep @@ -5654,6 +5790,16 @@ CrsMatrix:: myGraph_->setRowPtrsUnpacked(row_ptr_beg); } + template + void + copyAndPermuteStaticGraphNew( + const RowMatrix& srcMat, + RowMatrix& tgtMat, + const size_t numSameIDs, + const LocalOrdinal permuteToLIDs[], + const LocalOrdinal permuteFromLIDs[], + const size_t numPermutes); + template void CrsMatrix:: @@ -5664,6 +5810,7 @@ CrsMatrix:: const LocalOrdinal permuteFromLIDs[], const size_t numPermutes) { + //CTL_TRACE("copyAndPermuteStaticGraph_1"); using Details::ProfilingRegion; using Teuchos::Array; using Teuchos::ArrayView; @@ -5693,12 +5840,14 @@ CrsMatrix:: // Copy the first numSame row from source to target (this matrix). // This involves copying rows corresponding to LIDs [0, numSame-1]. // - const map_type& srcRowMap = * (srcMat.getRowMap ()); + const auto& srcRowMap = * (srcMat.getRowMap ()); nonconst_global_inds_host_view_type rowInds; nonconst_values_host_view_type rowVals; const LO numSameIDs_as_LID = static_cast (numSameIDs); - if (targetIsLocallyIndexed && sourceIsLocallyIndexed) { + // FIXME - need to examine this path + if (0 && targetIsLocallyIndexed && sourceIsLocallyIndexed) { + // Create a mapping from the source's local column id's to my local column ids using DT = typename Node::device_type; const map_type& src_col_map = *(srcMat.getColMap()); @@ -5708,17 +5857,19 @@ CrsMatrix:: auto local_tgt_col_map = tgt_col_map.getLocalMap(); auto invalid = Teuchos::OrdinalTraits::invalid(); - auto num_src_cols = src_col_map.getLocalNumElements(); + LO num_src_cols = static_cast(src_col_map.getLocalNumElements()); Kokkos::UnorderedMap lid_map(num_src_cols); - for (int src_local_col_idx=0; src_local_col_idx:: Kokkos::View indices("tgt_local_cols", src_local_cols.extent(0)); Kokkos::View values("tgt_local_vals", src_local_cols.extent(0)); - int idx = 0; - for (int offset=0; offset:: idx += 1; } } - auto inds = Kokkos::subview(indices, Kokkos::make_pair(0, idx)); - auto vals = Kokkos::subview(values, Kokkos::make_pair(0, idx)); + auto inds = Kokkos::subview(indices, Kokkos::make_pair(size_t(0), idx)); + auto vals = Kokkos::subview(values, Kokkos::make_pair(size_t(0), idx)); this->replaceLocalValues(local_row, inds, vals); } } else if (sourceIsLocallyIndexed) { @@ -5750,28 +5901,96 @@ CrsMatrix:: // Global ID for the current row index in the source matrix. // The first numSameIDs GIDs in the two input lists are the // same, so sourceGID == targetGID in this case. + const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); const GO targetGID = sourceGID; - ArrayViewrowIndsConstView; + ArrayView rowIndsConstView; ArrayView rowValsConstView; const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); + if (rowLength > static_cast (rowInds.size())) { Kokkos::resize(rowInds,rowLength); Kokkos::resize(rowVals,rowLength); } // Resizing invalidates an Array's views, so we must make new // ones, even if rowLength hasn't changed. - nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + + nonconst_global_inds_host_view_type rowIndsView; + nonconst_values_host_view_type rowValsView; // The source matrix is locally indexed, so we have to get a // copy. Really it's the GIDs that have to be copied (because // they have to be converted from LIDs). size_t checkRowLength = 0; - srcMat.getGlobalRowCopy (sourceGID, rowIndsView, - rowValsView, checkRowLength); + + if (panzer_impl_old) + { + rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + rowIndsConstView = Teuchos::ArrayView (rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + + rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + rowValsConstView = Teuchos::ArrayView (reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + + srcMat.getGlobalRowCopy (sourceGID, rowIndsView, + rowValsView, checkRowLength); + } + if (panzer_impl_new) { + using crs_matrix_type = CrsMatrix; + const crs_matrix_type *srcMatCrsPtr = dynamic_cast(&srcMat); + if (!srcMatCrsPtr) { + std::cout << "srk error srcMat type= " << typeid(srcMat).name() << std::endl; + std::terminate(); + } + const crs_matrix_type& srcMatCrs = *srcMatCrsPtr; + + auto globalRow = sourceGID; + auto StaticGraphRCP = srcMatCrs.getGraph(); + const crs_graph_type *StaticGraphPtr = dynamic_cast(StaticGraphRCP.get()); + if (!StaticGraphPtr) { + std::cout << "srk error StaticGraphPtr type= " << typeid(*StaticGraphRCP.get()).name() << std::endl; + std::terminate(); + } + const crs_graph_type& StaticGraph = *StaticGraphPtr; + const RowInfo rowinfo = StaticGraph.getRowInfoFromGlobalRowIndex (globalRow); + const size_t theNumEntries = rowinfo.numEntries; + // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + // static_cast (indices.size ()) < theNumEntries || + // static_cast (values.size ()) < theNumEntries, + // std::runtime_error, "Row with global index " << globalRow << " has " + // << theNumEntries << " entry/ies, but indices.size() = " << + // indices.size () << " and values.size() = " << values.size () << "."); + checkRowLength = theNumEntries; // first side effect + auto numEntries = theNumEntries; + + if (rowinfo.localRow != Teuchos::OrdinalTraits::invalid ()) { + if (StaticGraph.isLocallyIndexed ()) { + const map_type& colMap = * (StaticGraph.getColMap()); + auto curLclInds = StaticGraph.getLocalIndsViewHost(rowinfo); + auto rowValsViewLocal = srcMatCrs.getValuesViewHost(rowinfo); + rowValsConstView = Teuchos::ArrayView (reinterpret_cast(rowValsViewLocal.data()), rowValsViewLocal.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + auto rowIndsViewLocal = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + rowIndsConstView = Teuchos::ArrayView (rowIndsViewLocal.data(), rowIndsViewLocal.extent(0), Teuchos::RCP_DISABLE_NODE_LOOKUP); + bool err = colMap.getGlobalElements(curLclInds.data(), numEntries, rowIndsViewLocal.data()); + if (err) { + std::cout << "[srk] error:" << std::endl; + std::terminate(); + } + } + else if (StaticGraph.isGloballyIndexed ()) { + auto rowIndsViewLocal = StaticGraph.getGlobalIndsViewHost(rowinfo); + rowIndsConstView = Teuchos::ArrayView (rowIndsViewLocal.data(), rowIndsViewLocal.extent(0), Teuchos::RCP_DISABLE_NODE_LOOKUP); + auto rowValsViewLocal = srcMatCrs.getValuesViewHost(rowinfo); + rowValsConstView = Teuchos::ArrayView (reinterpret_cast(rowValsViewLocal.data()), rowValsViewLocal.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + } + } + } + if (debug) { TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC (rowLength != checkRowLength, std::logic_error, "For " @@ -5781,24 +6000,10 @@ CrsMatrix:: "a row length of " << checkRowLength << "." << suffix); } - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take - // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView - // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews - // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews - rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with - // KDDKDD UVM TEMPORARY: KokkosView interface - // Applying a permutation to a matrix with a static graph - // means REPLACE-ing entries. combineGlobalValues(targetGID, rowIndsConstView, rowValsConstView, REPLACE, prefix_raw, debug, verbose); - } + } // for (sourceLID... } else { for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { // Global ID for the current row index in the source matrix. @@ -5839,6 +6044,10 @@ CrsMatrix:: os << *prefix << "Do permutes" << endl; } + // + // "Permute" part of "copy and permute." + // + const map_type& tgtRowMap = * (this->getRowMap ()); for (size_t p = 0; p < numPermutes; ++p) { const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]); @@ -5925,6 +6134,7 @@ CrsMatrix:: const Kokkos::DualView& permuteFromLIDs_dv, const size_t numPermutes) { + //CTL_TRACE("copyAndPermuteStaticGraph_2"); using Details::ProfilingRegion; using Teuchos::Array; using Teuchos::ArrayView; @@ -6113,6 +6323,7 @@ CrsMatrix:: const Kokkos::DualView& permuteFromLIDs, const CombineMode /*CM*/) { + //CTL_TRACE("copyAndPermute"); using Details::Behavior; using Details::dualViewStatusToString; using Details::ProfilingRegion; @@ -6155,15 +6366,40 @@ CrsMatrix:: using RMT = RowMatrix; const RMT& srcMat = dynamic_cast (srcObj); if (isStaticGraph ()) { - TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () ); - auto permuteToLIDs_h = permuteToLIDs.view_host (); - TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () ); - auto permuteFromLIDs_h = permuteFromLIDs.view_host (); + if (panzer_impl_new) { + double time_ = Teuchos::Time::wallTime(); + TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () ); + auto permuteToLIDs_d = permuteToLIDs.view_device (); + TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () ); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); + + copyAndPermuteStaticGraphNew(srcMat, *this, + numSameIDs, + permuteToLIDs_d.data(), + permuteFromLIDs_d.data(), + numPermute); + if (in_eval_J) { + timer_capsg += -time_ + Teuchos::Time::wallTime(); + } + + + } + if (panzer_impl_old) { + double time_ = Teuchos::Time::wallTime(); + TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () ); + auto permuteToLIDs_h = permuteToLIDs.view_host (); + TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () ); + auto permuteFromLIDs_h = permuteFromLIDs.view_host (); + + copyAndPermuteStaticGraph(srcMat, numSameIDs, + permuteToLIDs_h.data(), + permuteFromLIDs_h.data(), + numPermute); + if (in_eval_J) { + timer_capsg += -time_ + Teuchos::Time::wallTime(); + } - copyAndPermuteStaticGraph(srcMat, numSameIDs, - permuteToLIDs_h.data(), - permuteFromLIDs_h.data(), - numPermute); + } } else { copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs, @@ -6882,17 +7118,17 @@ CrsMatrix:: const bool verbose) { const char tfecfFuncName[] = "combineGlobalValues: "; - - if (isStaticGraph ()) { + const bool isg = isStaticGraph (); + if (isg) { // INSERT doesn't make sense for a static graph, since you // aren't allowed to change the structure of the graph. // However, all the other combine modes work. - if (combineMode == ADD) { - sumIntoGlobalValues (globalRowIndex, columnIndices, values); - } - else if (combineMode == REPLACE) { + if (combineMode == REPLACE) { replaceGlobalValues (globalRowIndex, columnIndices, values); } + else if (combineMode == ADD) { + sumIntoGlobalValues (globalRowIndex, columnIndices, values); + } else if (combineMode == ABSMAX) { using ::Tpetra::Details::AbsMax; AbsMax f; @@ -8905,6 +9141,7 @@ CrsMatrix:: #ifdef HAVE_TPETRA_MMM_TIMINGS Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries"))); #endif + Import_Util::sortAndMergeCrsEntries (CSR_rowptr_d, CSR_colind_LID_d, CSR_vals_d); @@ -9256,6 +9493,282 @@ CrsMatrix:: transferAndFillComplete (destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params); } + // FIXME - I put this at the end of the file to help with diffing code, it should be moved up/replace the old copyAndPermuteStaticGraph + template + void + copyAndPermuteStaticGraphNew( + const RowMatrix& srcMat, + RowMatrix& tgtMat, + const size_t numSameIDs, + const LocalOrdinal permuteToLIDs[], + const LocalOrdinal permuteFromLIDs[], + const size_t numPermutes) + { + using Details::ProfilingRegion; + using Teuchos::Array; + //using Teuchos::ArrayView; + using std::endl; + using LO = LocalOrdinal; + using GO = GlobalOrdinal; + + using impl_scalar_type = typename Kokkos::ArithTraits::val_type; + + using crs_matrix_type = CrsMatrix; + + typedef typename crs_matrix_type::local_inds_device_view_type::non_const_value_type local_inds_device_value_t; + typedef typename crs_matrix_type::row_ptrs_device_view_type::non_const_value_type row_ptrs_device_value_t; + typedef typename crs_matrix_type::local_matrix_device_type k_local_matrix_device_type; + + typedef typename Node::execution_space exec_space; + typedef Kokkos::RangePolicy range_type; + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + + const char tfecfFuncName[] = "copyAndPermuteStaticGraphNew"; + ProfilingRegion regionCAP + ("Tpetra::CrsMatrix::copyAndPermuteStaticGraphNew"); + + // const bool debug = Details::Behavior::debug("CrsGraph"); + // const bool verbose = Details::Behavior::verbose("CrsGraph"); + + const crs_matrix_type *srcMatCrsPtr = dynamic_cast(&srcMat); + if (!srcMatCrsPtr) { + std::cout << "srk error srcMat type= " << typeid(srcMat).name() << std::endl; + std::terminate(); + } + const crs_matrix_type& srcMatCrs = *srcMatCrsPtr; + + crs_matrix_type *tgtMatCrsPtr = dynamic_cast(&tgtMat); + if (!tgtMatCrsPtr) { + std::cout << "srk error tgtMat type= " << typeid(tgtMat).name() << std::endl; + std::terminate(); + } + crs_matrix_type& tgtMatCrs = *tgtMatCrsPtr; + + std::string prefix = tfecfFuncName; + // const char* const prefix_raw = prefix.c_str(); + + const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed (); + // + // Copy the first numSame row from source to target (this matrix). + // This involves copying rows corresponding to LIDs [0, numSame-1]. + // + const auto& srcRowMap = * (srcMat.getRowMap ()); + auto comm = srcRowMap.getComm(); + + const LO numSameIDs_as_LID = static_cast (numSameIDs); + + auto my_replaceGlobalValuesImpl_scalar + = KOKKOS_LAMBDA( + const bool sorted, const bool atomic, size_t hint[], + const size_t numInTgtRow, const local_inds_device_value_t tgtColInds[], impl_scalar_type tgtRowVals[], + const local_inds_device_value_t lclColInd, const impl_scalar_type newVals + ) -> LO + { + LO numValid = 0; // number of valid input column indices + + if (atomic) { + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + lclColInd, hint[0], sorted); + if (offset != numInTgtRow) { + Kokkos::atomic_store (&tgtRowVals[offset], newVals); + hint[0] = offset + 1; + numValid++; + } + } + } else { + if (lclColInd != LINV) { + const size_t offset = + KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + lclColInd, hint[0], sorted); + if (offset != numInTgtRow) { + tgtRowVals[offset] = newVals; + hint[0] = offset + 1; + numValid++; + } + } + } + return numValid; + }; + + if (sourceIsLocallyIndexed) { + + const k_local_matrix_device_type & srcMatDevice = srcMatCrs.getLocalMatrixDevice(); + const k_local_matrix_device_type & tgtMatDevice = tgtMatCrs.getLocalMatrixDevice(); + + typename crs_matrix_type::row_ptrs_device_view_type tgtLocalRowPtrsDevice = tgtMatCrs.getLocalRowPtrsDevice(); + typename crs_matrix_type::local_inds_device_view_type tgtLocalColIndsDevice = tgtMatCrs.getLocalIndicesDevice(); + typename crs_matrix_type::row_ptrs_host_view_type srcLocalRowPtrsHost = srcMatCrs.getLocalRowPtrsHost(); + typename crs_matrix_type::row_ptrs_device_view_type srcLocalRowPtrsDevice = srcMatCrs.getLocalRowPtrsDevice(); + typename crs_matrix_type::local_inds_device_view_type srcLocalColIndsDevice = srcMatCrs.getLocalIndicesDevice(); + + row_ptrs_device_value_t max_row_entries = 0; + + max_row_entries=0; + + // FIXME - is there a more concise kokkos way to do this? + Kokkos::parallel_reduce ("Tpetra_CrsMatrix_capsg_get_max_nc", range_type (0, numSameIDs_as_LID), + KOKKOS_LAMBDA(const LO sourceLID, row_ptrs_device_value_t& gmax) { + row_ptrs_device_value_t start = srcLocalRowPtrsDevice(sourceLID); + row_ptrs_device_value_t end = srcLocalRowPtrsDevice(sourceLID+1); + row_ptrs_device_value_t ct = (end - start); + if (ct > gmax) { + gmax = ct; + } + }, + Kokkos::Max(max_row_entries)); + + bool tgtMatIsSorted = tgtMatCrs.getCrsGraph()->isSorted(); + + using local_map_type = typename crs_matrix_type::map_type::local_map_type; + + local_map_type local_map = srcMat.getRowMap()->getLocalMap(); + local_map_type local_col_map = srcMat.getColMap()->getLocalMap(); + local_map_type tgt_local_map = tgtMatCrs.getRowMap()->getLocalMap(); + local_map_type tgt_local_col_map = tgtMatCrs.getColMap()->getLocalMap(); + + auto vals = srcMatCrs.getLocalValuesDevice (Access::ReadOnly); + auto tvals = tgtMatCrs.getLocalValuesDevice (Access::ReadWrite); + + Kokkos::parallel_for + ("Tpetra_CrsMatrix::copyAndPermuteStaticGraph", + range_type (0, numSameIDs_as_LID), + KOKKOS_LAMBDA(const LO sourceLID) + { + local_inds_device_value_t start = srcLocalRowPtrsDevice(sourceLID); + local_inds_device_value_t end = srcLocalRowPtrsDevice(sourceLID+1); + local_inds_device_value_t rowLength = (end - start); + + KOKKOS_ASSERT(rowLength <= max_row_entries); + + local_inds_device_value_t tstart = tgtLocalRowPtrsDevice(sourceLID); + local_inds_device_value_t tend = tgtLocalRowPtrsDevice(sourceLID + 1); + local_inds_device_value_t numInTgtRow = (tend - tstart); + + KOKKOS_ASSERT(tstart < tvals.extent(0)); + Scalar *tgtRowVals = &tvals(tstart); + const local_inds_device_value_t *tgtColInds = &tgtLocalColIndsDevice(tstart); + + size_t hint=0; + for (LO j = 0; j < rowLength; j++) { + local_inds_device_value_t ci = srcLocalColIndsDevice(start + j); + GO gi = local_col_map.getGlobalElement(ci); + const local_inds_device_value_t lclColInd = tgt_local_col_map.getLocalElement(gi); + my_replaceGlobalValuesImpl_scalar(tgtMatIsSorted, false, &hint, + numInTgtRow, tgtColInds, tgtRowVals, + lclColInd, vals(start+j)); + } + + }); // kokkos parallel_for + + } else { + for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { + // Global ID for the current row index in the source matrix. + // The first numSameIDs GIDs in the two input lists are the + // same, so sourceGID == targetGID in this case. + const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); + const GO targetGID = sourceGID; + + Teuchos::ArrayView rowIndsConstView; + Teuchos::ArrayView rowValsConstView; + + typename crs_matrix_type::global_inds_host_view_type rowIndsView; + typename crs_matrix_type::values_host_view_type rowValsView; + srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take + // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView + // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews + // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews + rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with + // KDDKDD UVM TEMPORARY: KokkosView interface + + // Applying a permutation to a matrix with a static graph + // means REPLACE-ing entries. + // FIXME - need to apply the same approach as above, maybe reuse my_replaceGlobalValuesImpl_scalar? + tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, + rowValsConstView); + } + } + + // FIXME - need to apply the same approach as above to the permutes + + // + // "Permute" part of "copy and permute." + // + typename crs_matrix_type::nonconst_global_inds_host_view_type rowInds; + typename crs_matrix_type::nonconst_values_host_view_type rowVals; + + const auto& tgtRowMap = * (tgtMat.getRowMap ()); + for (size_t p = 0; p < numPermutes; ++p) { + const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]); + const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]); + + Teuchos::ArrayView rowIndsConstView; + Teuchos::ArrayView rowValsConstView; + + if (sourceIsLocallyIndexed) { + const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); + if (rowLength > static_cast (rowInds.size ())) { + Kokkos::resize(rowInds,rowLength); + Kokkos::resize(rowVals,rowLength); + } + // Resizing invalidates an Array's views, so we must make new + // ones, even if rowLength hasn't changed. + typename crs_matrix_type::nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); + typename crs_matrix_type::nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); + + // The source matrix is locally indexed, so we have to get a + // copy. Really it's the GIDs that have to be copied (because + // they have to be converted from LIDs). + size_t checkRowLength = 0; + srcMat.getGlobalRowCopy(sourceGID, rowIndsView, + rowValsView, checkRowLength); + + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take + // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView + // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews + // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews + rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with + // KDDKDD UVM TEMPORARY: KokkosView interface + } + else { + typename crs_matrix_type::global_inds_host_view_type rowIndsView; + typename crs_matrix_type::values_host_view_type rowValsView; + srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); + // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take + // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView + // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews + // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews + rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD + rowIndsView.data(), rowIndsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD + reinterpret_cast(rowValsView.data()), rowValsView.extent(0), + Teuchos::RCP_DISABLE_NODE_LOOKUP); + // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with + // KDDKDD UVM TEMPORARY: KokkosView interface + } + + tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, + rowValsConstView); + } + + } + } // namespace Tpetra // diff --git a/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp b/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp index cc17543ec7c1..030b5555f756 100644 --- a/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp @@ -624,7 +624,21 @@ class WrappedDualView { // We check to see if the memory is not aliased *or* if it is a supported // (heterogeneous memory) accelerator (for shared host/device memory). - return !memoryIsAliased() || Spaces::is_gpu_exec_space(); + //return !memoryIsAliased() || Spaces::is_gpu_exec_space(); + if constexpr(Spaces::is_gpu_exec_space()) { + return true; + } else { + //return !memoryIsAliased(); + // { return deviceMemoryIsHostAccessible && dualView.h_view.data() == dualView.d_view.data() } + + // return !(deviceMemoryIsHostAccessible && dualView.h_view.data() == dualView.d_view.data()); + // return !deviceMemoryIsHostAccessible || dualView.h_view.data() != dualView.d_view.data(); + if constexpr(!deviceMemoryIsHostAccessible) { + return true; + } else { + return dualView.h_view.data() != dualView.d_view.data(); + } + } } diff --git a/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp b/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp index 4496f17eb12a..213c9c705400 100644 --- a/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_crsUtils.hpp @@ -20,6 +20,10 @@ #include #include #include +#include + +extern bool in_eval_MV; +extern double timer_ICI; /// \file Tpetra_Details_crsUtils.hpp /// \brief Functions for manipulating CRS arrays @@ -399,6 +403,7 @@ insert_crs_indices( using offset_type = typename std::decay::type; using ordinal_type = typename std::decay::type; + auto invalid_ordinal = Teuchos::OrdinalTraits::invalid(); const offset_type start = row_ptrs[row]; offset_type end = start + static_cast (num_assigned); @@ -422,8 +427,8 @@ insert_crs_indices( if (idx == cur_indices[row_offset]) { break; } - } - + } + if (row_offset == end) { if (num_inserted >= num_avail) { // not enough room return Teuchos::OrdinalTraits::invalid(); @@ -499,22 +504,67 @@ find_crs_indices( size_t num_found = 0; for (size_t k = 0; k < new_indices.size(); k++) { - auto row_offset = start; auto idx = std::forward(map)(new_indices[k]); if (idx == invalid_ordinal) continue; - for (; row_offset < end; row_offset++) + for (size_t row_offset = start; row_offset < end; row_offset++) { - if (idx == cur_indices[row_offset]) + size_t off = row_offset - start; + auto lidx = cur_indices[row_offset]; + if (idx == lidx) { - std::forward(cb)(k, start, row_offset - start); + std::forward(cb)(k, start, off); num_found++; + // FIXME why no break here, can an index be found twice? + // break; } } } return num_found; } +/// \brief Implementation of findCrsIndices +template +size_t +find_crs_indices_sorted( + typename Pointers::value_type const row, + Pointers const& row_ptrs, + const size_t curNumEntries, + Indices1 const& cur_indices, + Indices2 const& new_indices, + IndexMap&& map, + Callback&& cb) +{ + if (new_indices.size() == 0) + return 0; + + using ordinal = + typename std::remove_const::type; + auto invalid_ordinal = Teuchos::OrdinalTraits::invalid(); + + const size_t start = static_cast (row_ptrs[row]); + const size_t end = start + curNumEntries; + size_t num_found = 0; + for (size_t k = 0; k < new_indices.size(); k++) + { + auto idx = std::forward(map)(new_indices[k]); + if (idx == invalid_ordinal) + continue; + + // FIXME srk use kokkos findRelOffset + auto first = &cur_indices[start]; + auto first0 = first; + auto last = &cur_indices[end]; + first = std::lower_bound(first, last, idx); + size_t off = first - first0; + if (first != last && !(idx < *first)) { + std::forward(cb)(k, start, off); + num_found++; + } + } + return num_found; +} + } // namespace impl @@ -718,6 +768,20 @@ findCrsIndices( return impl::find_crs_indices(row, rowPtrs, curNumEntries, curIndices, newIndices, map, cb); } +template +size_t +findCrsIndicesSorted( + typename Pointers::value_type const row, + Pointers const& rowPtrs, + const size_t curNumEntries, + Indices1 const& curIndices, + Indices2 const& newIndices, + IndexMap&& map, + Callback&& cb) +{ + return impl::find_crs_indices_sorted(row, rowPtrs, curNumEntries, curIndices, newIndices, map, cb); +} + } // namespace Details } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_Map_decl.hpp b/packages/tpetra/core/src/Tpetra_Map_decl.hpp index 8f97f7b1d71f..de5b8c1cca7b 100644 --- a/packages/tpetra/core/src/Tpetra_Map_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_Map_decl.hpp @@ -642,6 +642,7 @@ namespace Tpetra { /// the same value as /// Teuchos::OrdinalTraits::invalid(). global_ordinal_type getGlobalElement (local_ordinal_type localIndex) const; + bool getGlobalElements (const local_ordinal_type localIndices[], size_t numEntries, global_ordinal_type globalIndices[]) const; /// \brief Get the local Map for Kokkos kernels. /// diff --git a/packages/tpetra/core/src/Tpetra_Map_def.hpp b/packages/tpetra/core/src/Tpetra_Map_def.hpp index c6b028e8f616..f697ab3aa8bb 100644 --- a/packages/tpetra/core/src/Tpetra_Map_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Map_def.hpp @@ -1066,7 +1066,7 @@ namespace Tpetra { // beginning of the range starts with the first entry. While // doing so, fill in the LID -> GID table. typename decltype (lgMap_)::non_const_type lgMap - (view_alloc ("lgMap", WithoutInitializing), numLocalElements_); + (view_alloc ("lgMap2", WithoutInitializing), numLocalElements_); // Because you can't use lambdas in constructors on CUDA. Or using private/protected data. // DEEP_COPY REVIEW - DEVICE-TO-DEVICE @@ -1274,6 +1274,40 @@ namespace Tpetra { } } + template + bool + Map:: + getGlobalElements (const local_ordinal_type localIndices[], size_t numEntries, global_ordinal_type globalIndices[]) const + { + auto const minGI = getMinGlobalIndex(); + auto const minLI = getMinLocalIndex(); + auto const maxLI = getMaxLocalIndex(); + if (isContiguous ()) { + for (size_t i = 0; i < numEntries; i++) { + auto lclInd = localIndices[i]; + if (lclInd < minLI || lclInd > maxLI) { + return true; + } + globalIndices[i] = minGI + lclInd; + } + } + else { + // This is a host Kokkos::View access, with no RCP or ArrayRCP + // involvement. As a result, it is thread safe. + // + // lgMapHost_ is a host pointer; this does NOT assume UVM. + lazyPushToHost(); + for (size_t i = 0; i < numEntries; i++) { + auto lclInd = localIndices[i]; + if (lclInd < minLI || lclInd > maxLI) { + return true; + } + globalIndices[i] = lgMapHost_[lclInd]; + } + } + return false; + } + template bool Map:: @@ -1662,7 +1696,7 @@ namespace Tpetra { using Kokkos::view_alloc; using Kokkos::WithoutInitializing; - lg_view_type lgMap ("lgMap", numElts); + lg_view_type lgMap ("lgMap3", numElts); if (verbose) { std::ostringstream os; os << *prefix << "Fill lgMap" << endl; @@ -1749,7 +1783,7 @@ namespace Tpetra { using Kokkos::view_alloc; using Kokkos::WithoutInitializing; - lg_view_type lgMap ("lgMap", numElts); + lg_view_type lgMap ("lgMap4", numElts); if (verbose) { std::ostringstream os; os << *prefix << "Fill lgMap" << endl; From e4a5255d8e8732431f2b59f1416558598cce27c9 Mon Sep 17 00:00:00 2001 From: srkenno Date: Wed, 13 Nov 2024 13:42:45 -0700 Subject: [PATCH 04/23] initial push; see branch srkenno/copy-and-permute-improvements --- .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 25 +- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 35 +- .../src/copyAndPermuteStaticGraph_new.hpp | 431 ------------------ 3 files changed, 28 insertions(+), 463 deletions(-) delete mode 100644 packages/tpetra/core/src/copyAndPermuteStaticGraph_new.hpp diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 3e1164909362..d6c4b2227750 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -41,20 +41,17 @@ #include #include -#define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 #if EXP_INCLUDED_FROM_PANXER_MINI_EM extern bool panzer_impl_new, panzer_impl_old; extern bool in_eval_J; extern double timer_evalJ; extern double timer_capsg; +#define PANZER_IMPL_NEW panzer_impl_new +#define PANZER_IMPL_OLD panzer_impl_old #else -namespace { -bool panzer_impl_new = true; -bool panzer_impl_old = !panzer_impl_new; -bool in_eval_J = false; -double timer_evalJ=0.0; -double timer_capsg=0.0; -} +#define PANZER_IMPL_NEW true +#define PANZER_IMPL_OLD false #endif namespace Tpetra { @@ -1875,7 +1872,7 @@ namespace Tpetra { auto map = [&](GO const gblInd){return colMap.getLocalElement(gblInd);}; - if (panzer_impl_new) { + if (PANZER_IMPL_NEW) { if (this->isSorted()) { numFound = Details::findCrsIndicesSorted(lclRow, this->getRowPtrsUnpackedHost(), rowInfo.numEntries, @@ -1886,7 +1883,7 @@ namespace Tpetra { lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); } } - if (panzer_impl_old) { + if (PANZER_IMPL_OLD) { numFound = Details::findCrsIndices(lclRow, this->getRowPtrsUnpackedHost(), rowInfo.numEntries, lclIndsUnpacked_wdv.getHostView(Access::ReadOnly), inputInds, map, fun); @@ -2368,14 +2365,14 @@ namespace Tpetra { if (isLocallyIndexed ()) { auto lclInds = getLocalIndsViewHost(rowinfo); - if (panzer_impl_new) { + if (PANZER_IMPL_NEW) { bool err = colMap_->getGlobalElements(lclInds.data(), theNumEntries, indices.data()); if (err) { std::cout << "[srk] error:" << std::endl; std::terminate(); } } - if (panzer_impl_old) { + if (PANZER_IMPL_OLD) { for (size_t j = 0; j < theNumEntries; ++j) { indices[j] = colMap_->getGlobalElement (lclInds(j)); } @@ -2383,10 +2380,10 @@ namespace Tpetra { } else if (isGloballyIndexed ()) { auto gblInds = getGlobalIndsViewHost(rowinfo); - if (panzer_impl_new) { + if (PANZER_IMPL_NEW) { std::memcpy((void*)indices.data(), (const void*) gblInds.data(), theNumEntries*sizeof(*indices.data())); } - if (panzer_impl_old) { + if (PANZER_IMPL_OLD) { for (size_t j = 0; j < theNumEntries; ++j) { indices[j] = gblInds(j); } diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 6e3db9a854c0..0a3dc297fde0 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -60,20 +60,17 @@ #include #include -#define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 #if EXP_INCLUDED_FROM_PANXER_MINI_EM extern bool panzer_impl_new, panzer_impl_old; extern bool in_eval_J; extern double timer_evalJ; extern double timer_capsg; +#define PANZER_IMPL_NEW panzer_impl_new +#define PANZER_IMPL_OLD panzer_impl_old #else -namespace { -bool panzer_impl_new = true; -bool panzer_impl_old = !panzer_impl_new; -bool in_eval_J = false; -double timer_evalJ=0.0; -double timer_capsg=0.0; -} +#define PANZER_IMPL_NEW true +#define PANZER_IMPL_OLD false #endif namespace Tpetra { @@ -2450,7 +2447,7 @@ namespace Tpetra { [[maybe_unused]] LocalOrdinal niv=0; - if (panzer_impl_old) { + if (PANZER_IMPL_OLD) { Teuchos::ArrayView indsT(inds, numElts); auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) { @@ -2460,7 +2457,7 @@ namespace Tpetra { niv = graph.findGlobalIndices(rowInfo, indsT, cb); } - if (panzer_impl_new) { // new + if (PANZER_IMPL_NEW) { // new typedef LocalOrdinal LO; typedef GlobalOrdinal GO; @@ -3360,14 +3357,14 @@ CrsMatrix:: const map_type& colMap = * (staticGraph_->colMap_); auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo); auto curVals = getValuesViewHost(rowinfo); - if (panzer_impl_old) { + if (PANZER_IMPL_OLD) { for (size_t j = 0; j < theNumEntries; ++j) { values[j] = curVals[j]; auto g = colMap.getGlobalElement (curLclInds(j)); indices[j] = g; } } - if (panzer_impl_new) { + if (PANZER_IMPL_NEW) { bool err = colMap.getGlobalElements(curLclInds.data(), numEntries, indices.data()); if (err) { std::cout << "[srk] error:" << std::endl; @@ -5925,7 +5922,7 @@ CrsMatrix:: // they have to be converted from LIDs). size_t checkRowLength = 0; - if (panzer_impl_old) + if (PANZER_IMPL_OLD) { rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); rowIndsConstView = Teuchos::ArrayView (rowIndsView.data(), rowIndsView.extent(0), @@ -5938,7 +5935,7 @@ CrsMatrix:: srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView, checkRowLength); } - if (panzer_impl_new) { + if (PANZER_IMPL_NEW) { using crs_matrix_type = CrsMatrix; const crs_matrix_type *srcMatCrsPtr = dynamic_cast(&srcMat); if (!srcMatCrsPtr) { @@ -6366,7 +6363,7 @@ CrsMatrix:: using RMT = RowMatrix; const RMT& srcMat = dynamic_cast (srcObj); if (isStaticGraph ()) { - if (panzer_impl_new) { + if (PANZER_IMPL_NEW) { double time_ = Teuchos::Time::wallTime(); TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () ); auto permuteToLIDs_d = permuteToLIDs.view_device (); @@ -6378,13 +6375,14 @@ CrsMatrix:: permuteToLIDs_d.data(), permuteFromLIDs_d.data(), numPermute); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM if (in_eval_J) { timer_capsg += -time_ + Teuchos::Time::wallTime(); } - +#endif } - if (panzer_impl_old) { + if (PANZER_IMPL_OLD) { double time_ = Teuchos::Time::wallTime(); TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () ); auto permuteToLIDs_h = permuteToLIDs.view_host (); @@ -6395,10 +6393,11 @@ CrsMatrix:: permuteToLIDs_h.data(), permuteFromLIDs_h.data(), numPermute); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM if (in_eval_J) { timer_capsg += -time_ + Teuchos::Time::wallTime(); } - +#endif } } else { diff --git a/packages/tpetra/core/src/copyAndPermuteStaticGraph_new.hpp b/packages/tpetra/core/src/copyAndPermuteStaticGraph_new.hpp deleted file mode 100644 index 2f619e6dd8c2..000000000000 --- a/packages/tpetra/core/src/copyAndPermuteStaticGraph_new.hpp +++ /dev/null @@ -1,431 +0,0 @@ - // not yet a member function of CrsMatrix - template - void - copyAndPermuteStaticGraph_new( - const RowMatrix& srcMat, - RowMatrix& tgtMat, - const size_t numSameIDs, - const LocalOrdinal permuteToLIDs[], - const LocalOrdinal permuteFromLIDs[], - const size_t numPermutes) - { - using Details::ProfilingRegion; - using Teuchos::Array; - //using Teuchos::ArrayView; - using std::endl; - using LO = LocalOrdinal; - using GO = GlobalOrdinal; - - using impl_scalar_type = typename Kokkos::ArithTraits::val_type; - typedef typename Kokkos::View::non_const_type nonconst_values_device_view_type; - typedef typename Kokkos::View::non_const_type nonconst_global_inds_device_view_type; - - const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); - - const char tfecfFuncName[] = "copyAndPermuteStaticGraph"; - ProfilingRegion regionCAP - ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph"); - - // const bool debug = Details::Behavior::debug("CrsGraph"); - // const bool verbose = Details::Behavior::verbose("CrsGraph"); - - using crs_matrix_type = CrsMatrix; - - const crs_matrix_type *srcMatCrsPtr = dynamic_cast(&srcMat); - if (!srcMatCrsPtr) { - std::cout << "srk error srcMat type= " << typeid(srcMat).name() << std::endl; - std::terminate(); - } - const crs_matrix_type& srcMatCrs = *srcMatCrsPtr; - - crs_matrix_type *tgtMatCrsPtr = dynamic_cast(&tgtMat); - if (!tgtMatCrsPtr) { - std::cout << "srk error tgtMat type= " << typeid(tgtMat).name() << std::endl; - std::terminate(); - } - crs_matrix_type& tgtMatCrs = *tgtMatCrsPtr; - - std::string prefix = tfecfFuncName; - // const char* const prefix_raw = prefix.c_str(); - - const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed (); - //const bool targetIsLocallyIndexed = tgtMat.isLocallyIndexed (); - // - // Copy the first numSame row from source to target (this matrix). - // This involves copying rows corresponding to LIDs [0, numSame-1]. - // - const auto& srcRowMap = * (srcMat.getRowMap ()); - auto comm = srcRowMap.getComm(); - - - const LO numSameIDs_as_LID = static_cast (numSameIDs); - - if (sourceIsLocallyIndexed) { - - typedef typename crs_matrix_type::local_matrix_device_type k_local_matrix_device_type; - //typedef typename k_local_matrix_device_type::StaticCrsGraphType k_graph_t; - // typedef typename k_graph_t::row_map_type::non_const_type k_row_map_t; - // typedef typename k_graph_t::entries_type::non_const_type k_nnz_t; - // typedef typename k_local_matrix_device_type::values_type::non_const_type k_scalar_view_t; - - const k_local_matrix_device_type & srcMatDevice = srcMatCrs.getLocalMatrixDevice(); - const k_local_matrix_device_type & tgtMatDevice = tgtMatCrs.getLocalMatrixDevice(); - const k_local_matrix_device_type * srcMatDevicePtr = &srcMatDevice; - const k_local_matrix_device_type * tgtMatDevicePtr = &tgtMatDevice; - - // auto lclIndsUnpacked_device = srcMat.getLocalIndicesDevice (); - - // auto nr = srcMatDevice.graph.numRows(); - // // if ((size_t)numSameIDs_as_LID >= (size_t)nr) { - // // std::cout << "numSameIDs_as_LID= " << numSameIDs_as_LID << " nr= " << nr << std::endl; - // // } - // TEUCHOS_ASSERT((size_t)numSameIDs_as_LID <= (size_t)nr); - -#define PR1(a) std::cout << "[srk] " << #a << "= " << a << std::endl - - typename crs_matrix_type::row_ptrs_device_view_type tgtLocalRowPtrsDevice = tgtMatCrs.getLocalRowPtrsDevice(); - typename crs_matrix_type::local_inds_device_view_type tgtLocalColIndsDevice = tgtMatCrs.getLocalIndicesDevice(); - typename crs_matrix_type::row_ptrs_host_view_type srcLocalRowPtrsHost = srcMatCrs.getLocalRowPtrsHost(); - typename crs_matrix_type::row_ptrs_device_view_type srcLocalRowPtrsDevice = srcMatCrs.getLocalRowPtrsDevice(); - typename crs_matrix_type::local_inds_device_view_type srcLocalColIndsDevice = srcMatCrs.getLocalIndicesDevice(); - - nonconst_global_inds_device_view_type srowInfo(Kokkos::ViewAllocateWithoutInitializing("srowInfo"), numSameIDs_as_LID); - - printf("here fence 0 numSameIDs_as_LID= %ld\n", numSameIDs_as_LID); - Kokkos::fence("srk0"); - printf("here fence 1\n"); - - typedef typename Node::execution_space exec_space; - typedef Kokkos::RangePolicy range_type; - - size_t mre=0; - for (LO sourceLID=0; sourceLID < numSameIDs_as_LID; sourceLID++) { - auto start = srcLocalRowPtrsHost(sourceLID); - auto end = srcLocalRowPtrsHost(sourceLID+1); - size_t rowLength = static_cast(end - start); - printf("sourceLID= %d start= %d end= %d rowLength= %ld\n", sourceLID, start, end, rowLength); - if (rowLength > mre) mre = rowLength; - } - printf("here b4 row_map, max_row_entries=%ld\n", mre); // prints 33 - Kokkos::parallel_for - ("Tpetra_CrsMatrix::copyAndPermuteStaticGraph", - range_type (0, numSameIDs_as_LID), - KOKKOS_LAMBDA(const LO sourceLID) - { - auto start = srcMatDevice.graph.row_map(sourceLID); // always print 0 - auto end = srcMatDevice.graph.row_map(sourceLID+1); // these print correctly - size_t rowLength = static_cast(end - start); - printf("0 k_sourceLID= %d start= %d end= %d rowLength= %ld\n", sourceLID, start, end, rowLength); - //printf("k_sourceLID= %d\n", sourceLID); - //srowInfo(sourceLID) = rowLength; - }); // kokkos parallel_for - - printf("here fence 2.0\n"); - Kokkos::fence("srk00"); - printf("here fence 2\n"); - - printf("here b4 srowInfo, max_row_entries=%ld\n", mre); - Kokkos::parallel_for - ("Tpetra_CrsMatrix::copyAndPermuteStaticGraph", - range_type (0, numSameIDs_as_LID), - KOKKOS_LAMBDA(const LO sourceLID) - { - auto start = srcLocalRowPtrsDevice(sourceLID); - auto end = srcLocalRowPtrsDevice(sourceLID+1); - size_t rowLength = static_cast(end - start); - printf("1 k_sourceLID= %d start= %d end= %d rowLength= %ld\n", sourceLID, start, end, rowLength); - //printf("k_sourceLID= %d\n", sourceLID); - //srowInfo(sourceLID) = rowLength; - }); // kokkos parallel_for - - printf("here fence 2.0\n"); - Kokkos::fence("srk00"); - printf("here fence 2\n"); - - size_t max_row_entries = 0; - Kokkos::parallel_reduce ("Tpetra_CrsMatrix_capsg_get_max_nc", range_type (0, numSameIDs_as_LID), - KOKKOS_LAMBDA(const LO sourceLID, size_t& gmax) { - auto start = srcLocalRowPtrsDevice(sourceLID); - auto end = srcLocalRowPtrsDevice(sourceLID+1); - size_t ct = static_cast(end - start); - - if (ct > gmax) gmax = ct; - }, max_row_entries); - - printf("here 0-af-pr: max_row_entries= %ld mre= %ld\n", max_row_entries, mre); - max_row_entries = mre; - - Kokkos::fence("srk1"); - printf("here 0-af-fence:\n"); - - auto local_map = srcMat.getRowMap()->getLocalMap(); - auto local_map_ptr = &local_map; - auto local_col_map = srcMat.getColMap()->getLocalMap(); - auto local_col_map_ptr = &local_col_map; - - nonconst_global_inds_device_view_type rowInds(Kokkos::ViewAllocateWithoutInitializing("srk_rowInds"), max_row_entries); - nonconst_values_device_view_type rowVals(Kokkos::ViewAllocateWithoutInitializing("srk_rowVals"), max_row_entries); - - bool tgtMatIsSorted = tgtMatCrs.getCrsGraph()->isSorted(); - - using local_map_type = typename crs_matrix_type::map_type::local_map_type; - //using crs_graph_type = CrsGraph; - - local_map_type tgt_local_map = tgtMatCrs.getRowMap()->getLocalMap(); - local_map_type tgt_local_col_map = tgtMatCrs.getColMap()->getLocalMap(); - auto tgt_local_map_ptr = &tgt_local_map; - auto tgt_local_col_map_ptr = &tgt_local_col_map; - - auto my_replaceGlobalValuesImpl - = KOKKOS_LAMBDA( - const bool sorted, const bool atomic, size_t hint[], - const size_t numInTgtRow, const LO tgtColInds[], impl_scalar_type tgtRowVals[], - const size_t numToReplace, const GO inds[], const impl_scalar_type newVals[] - ) -> LO - { - LO numValid = 0; // number of valid input column indices - - if (atomic) { - for (LO j = 0; j < (LO)numToReplace; ++j) { - const LO lclColInd = tgt_local_col_map_ptr->getLocalElement(inds[j]); - if (lclColInd != LINV) { - const size_t offset = - KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, - lclColInd, hint[0], sorted); - if (offset != numInTgtRow) { - Kokkos::atomic_store (&tgtRowVals[offset], newVals[j]); - hint[0] = offset + 1; - numValid++; - } - } - } - } else { - for (LO j = 0; j < (LO)numToReplace; ++j) { - const LO lclColInd = tgt_local_col_map_ptr->getLocalElement (inds[j]); - if (lclColInd != LINV) { - const size_t offset = - KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, - lclColInd, hint[0], sorted); - if (offset != numInTgtRow) { - tgtRowVals[offset] = newVals[j]; - hint[0] = offset + 1; - numValid++; - } - } - } - } - return numValid; - }; - - printf("here 0: R: %d row_map.size= %d numSameIDs_as_LID= %d\n", comm->getRank(), srcMatDevicePtr->graph.row_map.extent(0), numSameIDs_as_LID); - - printf("here 001: %d\n", comm->getRank()); - Kokkos::fence("srk01"); - printf("here 002: %d\n", comm->getRank()); - - const GO rl0 = - Tpetra::Details::getEntryOnHost(srowInfo, 0); - - printf("here 01: %d %ld\n", comm->getRank(), rl0); - - auto vals = srcMatCrs.getLocalValuesDevice (Access::ReadOnly); - auto tvals = tgtMatCrs.getLocalValuesDevice (Access::ReadWrite); - - Kokkos::fence("srk01"); - - Kokkos::parallel_for - ("Tpetra_CrsMatrix::copyAndPermuteStaticGraph", - range_type (0, numSameIDs_as_LID), - KOKKOS_LAMBDA(const LO sourceLID) - { - //printf("sourceLID= %d\n", sourceLID); - - auto start = srcLocalRowPtrsDevice(sourceLID); - auto end = srcLocalRowPtrsDevice(sourceLID+1); - size_t rowLength = static_cast(end - start); - // srowInfo(sourceLID) = rowLength; - [[maybe_unused]] size_t checkRowLength = 0; - - const size_t numEntries = rowLength; - //auto dev_row_info = srcMatDevicePtr->row(sourceLID); - - //KOKKOS_ASSERT(dev_row_info.length == rowLength); - - checkRowLength = numEntries; // first side effect - -#ifdef COMP -# undef COMP -#endif -#define COMP(a,b) do { if (int(a) != int(b)) { std::cout << "error: " << #a << "= " << a << " " << #b << "= " << b << " line= " << __LINE__ << std::endl; std::terminate(); } } while(0) - - for (size_t j = 0; j < rowLength; j++) { - // //auto ci = dev_row_info.colidx(j); - auto ci = srcLocalColIndsDevice(start + j); - auto gi = local_col_map_ptr->getGlobalElement(ci); - // rowInds(j) = gi; - // rowVals(j) = vals(start + j); - } - - // auto tgt_dev_row_info = tgtMatDevicePtr->row(sourceLID); - // LO *tgtColInds = &tgt_dev_row_info.colidx(0); - // Scalar *tgtRowVals = &tgt_dev_row_info.value(0); - // size_t numInTgtRow = tgt_dev_row_info.length; - auto tstart = tgtLocalRowPtrsDevice(sourceLID); - auto tend = tgtLocalRowPtrsDevice(sourceLID + 1); - size_t numInTgtRow = static_cast(tend - tstart); - Scalar *tgtRowVals = &tvals(tstart); - const LO *tgtColInds = &tgtLocalColIndsDevice(tstart); - - size_t hint=0; - // my_replaceGlobalValuesImpl(tgtMatIsSorted, false, &hint, // tgt_local_col_map, - // numInTgtRow, tgtColInds, tgtRowVals, - // rowLength, rowInds.data(), rowVals.data()); - - }); // kokkos parallel_for - - - printf("here 02: %d\n", comm->getRank()); - - Kokkos::fence("srk02"); - - { - bool tsd = tgtMatCrs.get_values_unpacked_wdv().need_sync_device(); - bool tsh = tgtMatCrs.get_values_unpacked_wdv().need_sync_host(); - bool ssd = srcMatCrs.get_values_unpacked_wdv().need_sync_device(); - bool ssh = srcMatCrs.get_values_unpacked_wdv().need_sync_host(); - - PR1(tsd); - PR1(tsh); - PR1(ssd); - PR1(ssh); - } - - // if (tsd) tgtMatCrs.get_values_unpacked_wdv().sync_device(); - // if (tsh) tgtMatCrs.get_values_unpacked_wdv().sync_host(); - // if (ssd) srcMatCrs.get_values_unpacked_wdv().sync_device(); - // if (ssh) srcMatCrs.get_values_unpacked_wdv().sync_host(); - // auto tgtSyncView = tgtMatCrs.get_values_unpacked_wdv().getHostView(Access::ReadOnly); - // const RowInfo rowInfo = tgtMatCrs.getCrsGraph()->getRowInfo(0); - // auto t2 = tgtMatCrs.getCrsGraph()->getLocalIndsViewHost(rowInfo); - // std::cout << "[srk] tgtSyncView= " << tgtSyncView.extent(0) << " t2= " << t2.extent(0) << std::endl; - - Kokkos::fence("srk2"); - - { - bool tsd = tgtMatCrs.get_values_unpacked_wdv().need_sync_device(); - bool tsh = tgtMatCrs.get_values_unpacked_wdv().need_sync_host(); - bool ssd = srcMatCrs.get_values_unpacked_wdv().need_sync_device(); - bool ssh = srcMatCrs.get_values_unpacked_wdv().need_sync_host(); - - PR1(tsd); - PR1(tsh); - PR1(ssd); - PR1(ssh); - } - - printf("here 1: %d\n", comm->getRank()); - - - } else { - for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) { - // Global ID for the current row index in the source matrix. - // The first numSameIDs GIDs in the two input lists are the - // same, so sourceGID == targetGID in this case. - const GO sourceGID = srcRowMap.getGlobalElement (sourceLID); - const GO targetGID = sourceGID; - - Teuchos::ArrayView rowIndsConstView; - Teuchos::ArrayView rowValsConstView; - - typename crs_matrix_type::global_inds_host_view_type rowIndsView; - typename crs_matrix_type::values_host_view_type rowValsView; - srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take - // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView - // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews - // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews - rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with - // KDDKDD UVM TEMPORARY: KokkosView interface - - // Applying a permutation to a matrix with a static graph - // means REPLACE-ing entries. - tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, - rowValsConstView); - } - } - - // - // "Permute" part of "copy and permute." - // - - typename crs_matrix_type::nonconst_global_inds_host_view_type rowInds; - typename crs_matrix_type::nonconst_values_host_view_type rowVals; - - const auto& tgtRowMap = * (tgtMat.getRowMap ()); - for (size_t p = 0; p < numPermutes; ++p) { - const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]); - const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]); - - Teuchos::ArrayView rowIndsConstView; - Teuchos::ArrayView rowValsConstView; - - if (sourceIsLocallyIndexed) { - const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID); - if (rowLength > static_cast (rowInds.size ())) { - Kokkos::resize(rowInds,rowLength); - Kokkos::resize(rowVals,rowLength); - } - // Resizing invalidates an Array's views, so we must make new - // ones, even if rowLength hasn't changed. - typename crs_matrix_type::nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength)); - typename crs_matrix_type::nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength)); - - // The source matrix is locally indexed, so we have to get a - // copy. Really it's the GIDs that have to be copied (because - // they have to be converted from LIDs). - size_t checkRowLength = 0; - srcMat.getGlobalRowCopy(sourceGID, rowIndsView, - rowValsView, checkRowLength); - - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take - // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView - // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews - // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews - rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with - // KDDKDD UVM TEMPORARY: KokkosView interface - } - else { - typename crs_matrix_type::global_inds_host_view_type rowIndsView; - typename crs_matrix_type::values_host_view_type rowValsView; - srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView); - // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take - // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView - // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews - // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews - rowIndsConstView = Teuchos::ArrayView ( // BAD BAD BAD - rowIndsView.data(), rowIndsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - rowValsConstView = Teuchos::ArrayView ( // BAD BAD BAD - reinterpret_cast(rowValsView.data()), rowValsView.extent(0), - Teuchos::RCP_DISABLE_NODE_LOOKUP); - // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with - // KDDKDD UVM TEMPORARY: KokkosView interface - } - - tgtMatCrs.replaceGlobalValues(targetGID, rowIndsConstView, - rowValsConstView); - } - - } From 7c7a518c8e61a80bc9d7d0dc3f71cf8e6f0f2133 Mon Sep 17 00:00:00 2001 From: srkenno Date: Thu, 14 Nov 2024 14:52:06 -0700 Subject: [PATCH 05/23] allow repeat runs from the same executable; improve main timers output --- .../panzer/mini-em/example/BlockPrec/main.cpp | 104 ++++++++++++++---- 1 file changed, 81 insertions(+), 23 deletions(-) diff --git a/packages/panzer/mini-em/example/BlockPrec/main.cpp b/packages/panzer/mini-em/example/BlockPrec/main.cpp index 93426dead9e2..d6fd106e55bf 100644 --- a/packages/panzer/mini-em/example/BlockPrec/main.cpp +++ b/packages/panzer/mini-em/example/BlockPrec/main.cpp @@ -109,7 +109,10 @@ bool in_eval_MV = false; bool in_eval_J = false; double timer_evalJ=0.0; double timer_capsg=0.0; +double timer_main=0.0; +int numRepeatRuns = 1; +int repeat = 0; template static T parallel_reduce(Teuchos::RCP > comm, T& localVal, Teuchos::EReductionType red) { @@ -144,6 +147,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) size_t fom_num_cells; { + // defaults for command-line options int x_elements=-1,y_elements=-1,z_elements=-1,basis_order=1; int workset_size=2000; @@ -304,7 +308,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) physicsEqSet.set("Integration Order", 2*basis_order); RCP mesh; - int dim; + int dim=3; Teuchos::RCP mesh_factory; { Teuchos::TimeMonitor tMmesh(*Teuchos::TimeMonitor::getNewTimer(std::string("Mini-EM: build mesh"))); @@ -781,8 +785,9 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) fom_num_cells = mesh->getEntityCounts(dim); mainTimer.stop(); + timer_main = mainTimer.totalElapsedTime(); if (comm->getRank() == 0) { - std::cout << "mainTimer= " << mainTimer.totalElapsedTime() << std::endl; + std::cout << "mainTimer(run: " << repeat << "/" << numRepeatRuns << ") = " << timer_main << std::endl; } if (use_timer) { @@ -833,12 +838,6 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) } else if (use_timer) { Teuchos::TimeMonitor::summarize(*out,false,true,false,Teuchos::Union,"",true); } - timer_evalJ = parallel_reduce(comm, timer_evalJ, Teuchos::REDUCE_MAX); - timer_capsg = parallel_reduce(comm, timer_capsg, Teuchos::REDUCE_MAX); - if (!comm->getRank()) { - std::cout << "[dbg] timer_evalJ= " << timer_evalJ << std::endl; - std::cout << "[dbg] timer_capsg= " << timer_capsg << std::endl; - } return EXIT_SUCCESS; } @@ -856,6 +855,8 @@ int main(int argc,char * argv[]){ const char * solverNames[5] = {"Augmentation", "MueLu", "ML", "CG", "GMRES"}; solverType solver = MUELU; clp.setOption("solver",&solver,5,solverValues,solverNames,"Solver that is used"); + clp.setOption("num-repeat-runs",&numRepeatRuns); + // bool useComplex = false; // clp.setOption("complex","real",&useComplex); clp.recogniseAllOptions(false); @@ -871,32 +872,89 @@ int main(int argc,char * argv[]){ // TEUCHOS_ASSERT(!useComplex); } - int retVal; + Teuchos::RCP > comm + = Teuchos::rcp_dynamic_cast >(Teuchos::DefaultComm::getComm()); + + int retVal=0; + std::vector timer_evalJ_vec(numRepeatRuns), timer_capsg_vec(numRepeatRuns), timer_main_vec(numRepeatRuns); + // ========================================================================================================================== + for (repeat=0; repeat < numRepeatRuns; ++repeat) { + // ========================================================================================================================== + + in_eval_J = false; + timer_main = 0.0; + timer_evalJ = 0.0; + timer_capsg = 0.0; + if (linAlgebra == linAlgTpetra) { // if (useComplex) { -// #if defined(HAVE_TPETRA_COMPLEX_DOUBLE) -// typedef typename panzer::BlockedTpetraLinearObjFactory,int,panzer::GlobalOrdinal> blockedLinObjFactory; -// retVal = main_,int,panzer::GlobalOrdinal,blockedLinObjFactory,true>(clp, argc, argv); -// #else -// std::cout << std::endl -// << "WARNING" << std::endl -// << "Tpetra was compiled without Scalar=std::complex." << std::endl << std::endl; -// return EXIT_FAILURE; -// #endif -// } else { - typedef typename panzer::BlockedTpetraLinearObjFactory blockedLinObjFactory; - retVal = main_(clp, argc, argv); -// } + // #if defined(HAVE_TPETRA_COMPLEX_DOUBLE) + // typedef typename panzer::BlockedTpetraLinearObjFactory,int,panzer::GlobalOrdinal> blockedLinObjFactory; + // retVal = main_,int,panzer::GlobalOrdinal,blockedLinObjFactory,true>(clp, argc, argv); + // #else + // std::cout << std::endl + // << "WARNING" << std::endl + // << "Tpetra was compiled without Scalar=std::complex." << std::endl << std::endl; + // return EXIT_FAILURE; + // #endif + // } else { + typedef typename panzer::BlockedTpetraLinearObjFactory blockedLinObjFactory; + retVal = main_(clp, argc, argv); + // } #ifdef PANZER_HAVE_EPETRA_STACK } else if (linAlgebra == linAlgEpetra) { // TEUCHOS_ASSERT(!useComplex); typedef typename panzer::BlockedEpetraLinearObjFactory blockedLinObjFactory; retVal = main_(clp, argc, argv); #endif - } else + } else { TEUCHOS_ASSERT(false); + } + + if (1) { + timer_main = parallel_reduce(comm, timer_main, Teuchos::REDUCE_MAX); + timer_evalJ = parallel_reduce(comm, timer_evalJ, Teuchos::REDUCE_MAX); + timer_capsg = parallel_reduce(comm, timer_capsg, Teuchos::REDUCE_MAX); + if (!comm->getRank()) { + std::cout << "[TIMER] repeat= " << repeat << " timer_evalJ= " << timer_evalJ << std::endl; + std::cout << "[TIMER] repeat= " << repeat << " timer_capsg= " << timer_capsg << std::endl; + timer_main_vec[repeat] = timer_main; + timer_evalJ_vec[repeat] = timer_evalJ; + timer_capsg_vec[repeat] = timer_capsg; + } + } + + + // ========================================================================================================================== + } //for (int repeat=0; repeat < numRepeatRuns; ++repeat) { + // ========================================================================================================================== + + auto minMaxAve = [&] (const std::vector& vec, double MinMaxAve[3]) { + MinMaxAve[0] = std::numeric_limits::max(); + MinMaxAve[1] = -MinMaxAve[0]; + MinMaxAve[2] = 0.0; + for (auto v : vec) { + MinMaxAve[0] = std::min(MinMaxAve[0], v); + MinMaxAve[1] = std::max(MinMaxAve[1], v); + MinMaxAve[2] += v / double(vec.size()); + } + }; + if (!comm->getRank()) { + double MinMaxAve[3][3]; + minMaxAve(timer_main_vec, MinMaxAve[0]); + minMaxAve(timer_evalJ_vec, MinMaxAve[1]); + minMaxAve(timer_capsg_vec, MinMaxAve[2]); + auto pr = [&](int j, const std::string& name) { + std::cout << "[TIMER] " << name << " AVE(" << numRepeatRuns << " runs): " << MinMaxAve[j][2] + << " MIN: " << MinMaxAve[j][0] << " MAX: " << MinMaxAve[j][1] + << " PAR-IMB: " << MinMaxAve[j][1]/(MinMaxAve[j][0] == 0.0 ? 1.0 : MinMaxAve[j][0]) << std::endl; + }; + pr(0, "timer_main"); + pr(1, "timer_evalJ"); + pr(2, "timer_capsg"); + } Kokkos::finalize(); From 081bd32cbdeae9db408b385b837354f4aaaddbd1 Mon Sep 17 00:00:00 2001 From: srkenno Date: Fri, 15 Nov 2024 13:29:07 -0700 Subject: [PATCH 06/23] force a commit --- packages/panzer/mini-em/example/BlockPrec/main.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/panzer/mini-em/example/BlockPrec/main.cpp b/packages/panzer/mini-em/example/BlockPrec/main.cpp index d6fd106e55bf..c7504285b07f 100644 --- a/packages/panzer/mini-em/example/BlockPrec/main.cpp +++ b/packages/panzer/mini-em/example/BlockPrec/main.cpp @@ -925,7 +925,6 @@ int main(int argc,char * argv[]){ } } - // ========================================================================================================================== } //for (int repeat=0; repeat < numRepeatRuns; ++repeat) { // ========================================================================================================================== From 0f6b8605ed87a72334897b2c14e3eb32ba589118 Mon Sep 17 00:00:00 2001 From: srkenno Date: Fri, 15 Nov 2024 13:31:09 -0700 Subject: [PATCH 07/23] force a commit --- packages/panzer/mini-em/example/BlockPrec/main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/panzer/mini-em/example/BlockPrec/main.cpp b/packages/panzer/mini-em/example/BlockPrec/main.cpp index c7504285b07f..553669683dd8 100644 --- a/packages/panzer/mini-em/example/BlockPrec/main.cpp +++ b/packages/panzer/mini-em/example/BlockPrec/main.cpp @@ -924,6 +924,7 @@ int main(int argc,char * argv[]){ timer_capsg_vec[repeat] = timer_capsg; } } + // ========================================================================================================================== } //for (int repeat=0; repeat < numRepeatRuns; ++repeat) { From 1280c6320df926fbd2fcdf2d0cd98ffcb4094ce9 Mon Sep 17 00:00:00 2001 From: srkenno Date: Tue, 19 Nov 2024 08:30:00 -0700 Subject: [PATCH 08/23] sort timers, print nicely --- .../src/Panzer_AssemblyEngine_impl.hpp | 12 +++ .../src/Panzer_ModelEvaluator_impl.hpp | 4 +- .../panzer/mini-em/example/BlockPrec/main.cpp | 80 +++++++++++++------ .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 26 ++++-- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 9 +-- 5 files changed, 94 insertions(+), 37 deletions(-) diff --git a/packages/panzer/disc-fe/src/Panzer_AssemblyEngine_impl.hpp b/packages/panzer/disc-fe/src/Panzer_AssemblyEngine_impl.hpp index 2c0c506ec771..d1b8da9e7a33 100644 --- a/packages/panzer/disc-fe/src/Panzer_AssemblyEngine_impl.hpp +++ b/packages/panzer/disc-fe/src/Panzer_AssemblyEngine_impl.hpp @@ -15,7 +15,13 @@ #include "Panzer_FieldManagerBuilder.hpp" #include "Panzer_AssemblyEngine_InArgs.hpp" #include "Panzer_GlobalEvaluationDataContainer.hpp" +#include "Teuchos_Time.hpp" #include +#include +#include + +extern std::unordered_map> >& Timers; +extern bool in_eval_J; //=========================================================================== //=========================================================================== @@ -87,18 +93,24 @@ evaluate(const panzer::AssemblyEngineInArgs& in, const EvaluationFlags flags) } if ( flags.getValue() & EvaluationFlags::Scatter) { + double time = Teuchos::Time::wallTime(); PANZER_FUNC_TIME_MONITOR_DIFF("panzer::AssemblyEngine::evaluate_scatter("+PHX::print()+")",eval_scatter); { + double time1 = Teuchos::Time::wallTime(); PANZER_FUNC_TIME_MONITOR_DIFF("panzer::AssemblyEngine::lof->ghostToGlobalContainer("+PHX::print()+")",lof_gtgc); m_lin_obj_factory->ghostToGlobalContainer(*in.ghostedContainer_,*in.container_,LOC::F | LOC::Mat); + if (in_eval_J) Timers["lof-g2gc"].first += -time1 + Teuchos::Time::wallTime(); } { + double time1 = Teuchos::Time::wallTime(); PANZER_FUNC_TIME_MONITOR_DIFF("panzer::AssemblyEngine::gedc.ghostToGlobal("+PHX::print()+")",gedc_gtg); m_lin_obj_factory->beginFill(*in.container_); gedc.ghostToGlobal(LOC::F | LOC::Mat); m_lin_obj_factory->endFill(*in.container_); + if (in_eval_J) Timers["gedc-g2g"].first += -time1 + Teuchos::Time::wallTime(); } m_lin_obj_factory->endFill(*in.ghostedContainer_); + if (in_eval_J) Timers["eval_scatter"].first += -time + Teuchos::Time::wallTime(); } return; diff --git a/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp b/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp index 07bbc5344489..62676ce4dad5 100644 --- a/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp +++ b/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp @@ -48,6 +48,8 @@ #include "Tpetra_CrsMatrix.hpp" extern bool in_eval_J; +extern std::unordered_map> >& Timers; + extern double timer_evalJ; extern double timer_capsg; @@ -1588,7 +1590,7 @@ evalModelImpl_basic(const Thyra::ModelEvaluatorBase::InArgs &inArgs, ae_tm_.template getAsObject()->evaluate(ae_inargs); in_eval_J = false; - timer_evalJ += -time_ + Teuchos::Time::wallTime(); + Timers["evalJ"].first += -time_ + Teuchos::Time::wallTime(); } // HACK: set A to null before calling responses to avoid touching the diff --git a/packages/panzer/mini-em/example/BlockPrec/main.cpp b/packages/panzer/mini-em/example/BlockPrec/main.cpp index 553669683dd8..c46fb0da4229 100644 --- a/packages/panzer/mini-em/example/BlockPrec/main.cpp +++ b/packages/panzer/mini-em/example/BlockPrec/main.cpp @@ -103,6 +103,28 @@ bool panzer_impl_new = false; int panzer_impl_inp = 0; // 0, 1, 2=both +using TimersValType = std::pair>; +std::unordered_map TimersBase = + { + {"evalJ", {0.0, {}}}, + {"capsg", {0.0, {}}}, + {"capsg_G", {0.0, {}}}, + {"capsg_G_1", {0.0, {}}}, + {"capsg_G_2", {0.0, {}}}, + {"capsg_G_3", {0.0, {}}}, + {"capsg_G_4", {0.0, {}}}, + {"capsg_M", {0.0, {}}}, + {"capsg_G_pad", {0.0, {}}}, + {"capsg_G_apad", {0.0, {}}}, + {"main", {0.0, {}}}, + {"eval_scatter", {0.0, {}}}, + {"gedc-g2g", {0.0, {}}}, + {"lof-g2gc", {0.0, {}}}, + }; + +std::unordered_map& Timers = TimersBase; +std::unordered_map *TimersPtr = &TimersBase; + double timer_MV=0.0; double timer_ICI=0.0; bool in_eval_MV = false; @@ -785,9 +807,9 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) fom_num_cells = mesh->getEntityCounts(dim); mainTimer.stop(); - timer_main = mainTimer.totalElapsedTime(); + Timers["main"].first = mainTimer.totalElapsedTime(); if (comm->getRank() == 0) { - std::cout << "mainTimer(run: " << repeat << "/" << numRepeatRuns << ") = " << timer_main << std::endl; + std::cout << "mainTimer(run: " << repeat << "/" << numRepeatRuns << ") = " << Timers["main"].first << std::endl; } if (use_timer) { @@ -876,15 +898,16 @@ int main(int argc,char * argv[]){ = Teuchos::rcp_dynamic_cast >(Teuchos::DefaultComm::getComm()); int retVal=0; - std::vector timer_evalJ_vec(numRepeatRuns), timer_capsg_vec(numRepeatRuns), timer_main_vec(numRepeatRuns); + + for (auto& t : Timers) { + t.second.second.resize(numRepeatRuns); + } // ========================================================================================================================== for (repeat=0; repeat < numRepeatRuns; ++repeat) { // ========================================================================================================================== in_eval_J = false; - timer_main = 0.0; - timer_evalJ = 0.0; - timer_capsg = 0.0; + for (auto& t : Timers) t.second.first = 0.0; if (linAlgebra == linAlgTpetra) { @@ -913,15 +936,14 @@ int main(int argc,char * argv[]){ } if (1) { - timer_main = parallel_reduce(comm, timer_main, Teuchos::REDUCE_MAX); - timer_evalJ = parallel_reduce(comm, timer_evalJ, Teuchos::REDUCE_MAX); - timer_capsg = parallel_reduce(comm, timer_capsg, Teuchos::REDUCE_MAX); + for (auto& t : Timers) { + t.second.first = parallel_reduce(comm, t.second.first, Teuchos::REDUCE_MAX); + } if (!comm->getRank()) { - std::cout << "[TIMER] repeat= " << repeat << " timer_evalJ= " << timer_evalJ << std::endl; - std::cout << "[TIMER] repeat= " << repeat << " timer_capsg= " << timer_capsg << std::endl; - timer_main_vec[repeat] = timer_main; - timer_evalJ_vec[repeat] = timer_evalJ; - timer_capsg_vec[repeat] = timer_capsg; + for (auto& t : Timers) { + std::cout << "[TIMER] repeat= " << repeat << " " << t.first << " = " << t.second.first << std::endl; + t.second.second[repeat] = t.second.first; + } } } @@ -942,18 +964,26 @@ int main(int argc,char * argv[]){ }; if (!comm->getRank()) { - double MinMaxAve[3][3]; - minMaxAve(timer_main_vec, MinMaxAve[0]); - minMaxAve(timer_evalJ_vec, MinMaxAve[1]); - minMaxAve(timer_capsg_vec, MinMaxAve[2]); - auto pr = [&](int j, const std::string& name) { - std::cout << "[TIMER] " << name << " AVE(" << numRepeatRuns << " runs): " << MinMaxAve[j][2] - << " MIN: " << MinMaxAve[j][0] << " MAX: " << MinMaxAve[j][1] - << " PAR-IMB: " << MinMaxAve[j][1]/(MinMaxAve[j][0] == 0.0 ? 1.0 : MinMaxAve[j][0]) << std::endl; + double MinMaxAve[3]; + auto pr = [&](const std::string& name, double mma[]) { + printf("[TIMER] %25s %20.3f %20.3f %20.3f %20.3f\n", + name.c_str(), mma[2], mma[1], mma[0], mma[1]/(mma[0] == 0.0 ? 1.0 : mma[0]) ); + }; + auto title = [&]() { + printf("[TIMER] %d runs:\n" + "[TIMER] %25s %20s %20s %20s %20s\n", + numRepeatRuns, "Name", "AVE:", "MAX:", "MIN:", "MAX/MIN:"); }; - pr(0, "timer_main"); - pr(1, "timer_evalJ"); - pr(2, "timer_capsg"); + + using TimersDataType = std::pair ; + std::vector vt(Timers.begin(), Timers.end()); + std::sort(vt.begin(), vt.end(), [](const TimersDataType& a, const TimersDataType& b) { return a.second.first >= b.second.first; }); + title(); + for (const auto& t : vt) { + minMaxAve(t.second.second, MinMaxAve); + pr(t.first, MinMaxAve); + } + } Kokkos::finalize(); diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 3e1164909362..b754e8dd3bd9 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -43,17 +43,16 @@ #define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 #if EXP_INCLUDED_FROM_PANXER_MINI_EM +extern std::unordered_map> >& Timers; extern bool panzer_impl_new, panzer_impl_old; extern bool in_eval_J; -extern double timer_evalJ; -extern double timer_capsg; #else namespace { bool panzer_impl_new = true; bool panzer_impl_old = !panzer_impl_new; -bool in_eval_J = false; -double timer_evalJ=0.0; -double timer_capsg=0.0; +namespace { +const bool in_eval_J = false; +} } #endif @@ -1741,6 +1740,7 @@ namespace Tpetra { size_t numInserted; { auto gblIndsHostView = this->gblInds_wdv.getHostView(Access::ReadWrite); + // FIXME - device numInserted = Details::insertCrsIndices(lclRow, this->getRowPtrsUnpackedHost(), gblIndsHostView, numEntries, inputInds, fun); @@ -4868,6 +4868,7 @@ namespace Tpetra { const bool verbose = verbose_; Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermute"); + double capTime = Teuchos::Time::wallTime(); std::unique_ptr prefix; if (verbose) { @@ -4893,9 +4894,13 @@ namespace Tpetra { os << *prefix << "Compute padding" << endl; std::cerr << os.str (); } + double padTime = Teuchos::Time::wallTime(); auto padding = computeCrsPadding(srcRowGraph, numSameIDs, permuteToLIDs, permuteFromLIDs, verbose); + Timers["capsg_G_pad"].first += -padTime + Teuchos::Time::wallTime(); + double apadTime = Teuchos::Time::wallTime(); applyCrsPadding(*padding, verbose); + Timers["capsg_G_apad"].first += -apadTime + Teuchos::Time::wallTime(); // If the source object is actually a CrsGraph, we can use view // mode instead of copy mode to access the entries in each row, @@ -4923,6 +4928,8 @@ namespace Tpetra { // compatible with the expectations of view mode. Also, if the // source graph is not a CrsGraph, we can't use view mode, // because RowGraph only provides copy mode access to the data. + double time = Teuchos::Time::wallTime(); + for (size_t i = 0; i < numSameIDs; ++i, ++myid) { const GO gid = srcRowMap.getGlobalElement (myid); size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid); @@ -4931,7 +4938,9 @@ namespace Tpetra { srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); this->insertGlobalIndices (gid, row_length, row_copy.data()); } + if (in_eval_J) Timers["capsg_G_1"].first += -time + Teuchos::Time::wallTime(); } else { + double time = Teuchos::Time::wallTime(); if (verbose) { std::ostringstream os; os << *prefix << "! src_filled && srcCrsGraph != nullptr" << endl; @@ -4943,6 +4952,7 @@ namespace Tpetra { srcCrsGraph->getGlobalRowView (gid, row); this->insertGlobalIndices (gid, row.extent(0), row.data()); } + if (in_eval_J) Timers["capsg_G_2"].first += -time + Teuchos::Time::wallTime(); } // @@ -4952,6 +4962,7 @@ namespace Tpetra { auto permuteFromLIDs_h = permuteFromLIDs.view_host (); if (src_filled || srcCrsGraph == nullptr) { + double time = Teuchos::Time::wallTime(); for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); @@ -4961,7 +4972,9 @@ namespace Tpetra { srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length); this->insertGlobalIndices (mygid, row_length, row_copy.data()); } + if (in_eval_J) Timers["capsg_G_3"].first += -time + Teuchos::Time::wallTime(); } else { + double time = Teuchos::Time::wallTime(); for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); @@ -4969,6 +4982,7 @@ namespace Tpetra { srcCrsGraph->getGlobalRowView (srcgid, row); this->insertGlobalIndices (mygid, row.extent(0), row.data()); } + if (in_eval_J) Timers["capsg_G_4"].first += -time + Teuchos::Time::wallTime(); } if (verbose) { @@ -4976,6 +4990,8 @@ namespace Tpetra { os << *prefix << "Done" << endl; std::cerr << os.str (); } + if (in_eval_J) Timers["capsg_G"].first += -capTime + Teuchos::Time::wallTime(); + } template diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 6e3db9a854c0..2124af0a914c 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -63,16 +63,13 @@ #define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 #if EXP_INCLUDED_FROM_PANXER_MINI_EM extern bool panzer_impl_new, panzer_impl_old; +extern std::unordered_map> >& Timers; extern bool in_eval_J; -extern double timer_evalJ; -extern double timer_capsg; #else namespace { bool panzer_impl_new = true; bool panzer_impl_old = !panzer_impl_new; bool in_eval_J = false; -double timer_evalJ=0.0; -double timer_capsg=0.0; } #endif @@ -6379,7 +6376,7 @@ CrsMatrix:: permuteFromLIDs_d.data(), numPermute); if (in_eval_J) { - timer_capsg += -time_ + Teuchos::Time::wallTime(); + Timers["capsg_M"].first += -time_ + Teuchos::Time::wallTime(); } @@ -6396,7 +6393,7 @@ CrsMatrix:: permuteFromLIDs_h.data(), numPermute); if (in_eval_J) { - timer_capsg += -time_ + Teuchos::Time::wallTime(); + Timers["capsg_M"].first += -time_ + Teuchos::Time::wallTime(); } } From f608765d662c37b251124e406ed58958453999ad Mon Sep 17 00:00:00 2001 From: srkenno Date: Tue, 19 Nov 2024 09:57:25 -0700 Subject: [PATCH 09/23] tweaks to timers --- .../src/Panzer_ModelEvaluator_impl.hpp | 3 ++- .../panzer/mini-em/example/BlockPrec/main.cpp | 26 ++++++++++++------- .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 8 +++--- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp b/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp index 62676ce4dad5..c7ec62c2436d 100644 --- a/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp +++ b/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp @@ -47,6 +47,7 @@ #include "Thyra_TpetraLinearOp.hpp" #include "Tpetra_CrsMatrix.hpp" +extern bool use_eval_J; extern bool in_eval_J; extern std::unordered_map> >& Timers; @@ -1589,7 +1590,7 @@ evalModelImpl_basic(const Thyra::ModelEvaluatorBase::InArgs &inArgs, thGhostedContainer->initializeMatrix(0.0); ae_tm_.template getAsObject()->evaluate(ae_inargs); - in_eval_J = false; + in_eval_J = !use_eval_J; Timers["evalJ"].first += -time_ + Teuchos::Time::wallTime(); } diff --git a/packages/panzer/mini-em/example/BlockPrec/main.cpp b/packages/panzer/mini-em/example/BlockPrec/main.cpp index c46fb0da4229..36d91fd648bc 100644 --- a/packages/panzer/mini-em/example/BlockPrec/main.cpp +++ b/packages/panzer/mini-em/example/BlockPrec/main.cpp @@ -128,6 +128,7 @@ std::unordered_map *TimersPtr = &TimersBase; double timer_MV=0.0; double timer_ICI=0.0; bool in_eval_MV = false; +bool use_eval_J = false; // override for the next value bool in_eval_J = false; double timer_evalJ=0.0; double timer_capsg=0.0; @@ -257,7 +258,9 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) return EXIT_FAILURE; } - std::cout << "P" << comm->getRank() << ": [dbg] panzer_impl_old= " << panzer_impl_old << " panzer_impl_new= " << panzer_impl_new << std::endl; + std::cout << "P" << comm->getRank() << ": [dbg] panzer_impl_old= " << panzer_impl_old << " panzer_impl_new= " << panzer_impl_new + << " use_eval_J= " << use_eval_J + << std::endl; #ifdef HAVE_TEUCHOS_STACKTRACE @@ -878,6 +881,7 @@ int main(int argc,char * argv[]){ solverType solver = MUELU; clp.setOption("solver",&solver,5,solverValues,solverNames,"Solver that is used"); clp.setOption("num-repeat-runs",&numRepeatRuns); + clp.setOption("use-evalJ","no-use-evalJ", &use_eval_J,"Run with sub-timers only active if evalModel(J) is active."); // bool useComplex = false; // clp.setOption("complex","real",&useComplex); @@ -906,7 +910,9 @@ int main(int argc,char * argv[]){ for (repeat=0; repeat < numRepeatRuns; ++repeat) { // ========================================================================================================================== - in_eval_J = false; + in_eval_J = !use_eval_J; + std::cout << "P" << comm->getRank() << ": [dbg] use_eval_J= " << use_eval_J << std::endl; + for (auto& t : Timers) t.second.first = 0.0; if (linAlgebra == linAlgTpetra) { @@ -935,17 +941,17 @@ int main(int argc,char * argv[]){ TEUCHOS_ASSERT(false); } - if (1) { + if (1) { + for (auto& t : Timers) { + t.second.first = parallel_reduce(comm, t.second.first, Teuchos::REDUCE_MAX); + } + if (!comm->getRank()) { for (auto& t : Timers) { - t.second.first = parallel_reduce(comm, t.second.first, Teuchos::REDUCE_MAX); - } - if (!comm->getRank()) { - for (auto& t : Timers) { - std::cout << "[TIMER] repeat= " << repeat << " " << t.first << " = " << t.second.first << std::endl; - t.second.second[repeat] = t.second.first; - } + std::cout << "[TIMER] repeat= " << repeat << " " << t.first << " = " << t.second.first << std::endl; + t.second.second[repeat] = t.second.first; } } + } // ========================================================================================================================== diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index b754e8dd3bd9..8a847155d7bd 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -4897,10 +4897,10 @@ namespace Tpetra { double padTime = Teuchos::Time::wallTime(); auto padding = computeCrsPadding(srcRowGraph, numSameIDs, permuteToLIDs, permuteFromLIDs, verbose); - Timers["capsg_G_pad"].first += -padTime + Teuchos::Time::wallTime(); + if (in_eval_J) Timers["capsg_G_pad"].first += -padTime + Teuchos::Time::wallTime(); double apadTime = Teuchos::Time::wallTime(); applyCrsPadding(*padding, verbose); - Timers["capsg_G_apad"].first += -apadTime + Teuchos::Time::wallTime(); + if (in_eval_J) Timers["capsg_G_apad"].first += -apadTime + Teuchos::Time::wallTime(); // If the source object is actually a CrsGraph, we can use view // mode instead of copy mode to access the entries in each row, @@ -4990,7 +4990,9 @@ namespace Tpetra { os << *prefix << "Done" << endl; std::cerr << os.str (); } - if (in_eval_J) Timers["capsg_G"].first += -capTime + Teuchos::Time::wallTime(); + if (in_eval_J) { + Timers["capsg_G"].first += -capTime + Teuchos::Time::wallTime(); + } } From 99357879becb5ae624acd2e9ce72bc36c40f3a57 Mon Sep 17 00:00:00 2001 From: srkenno Date: Wed, 20 Nov 2024 15:29:37 -0700 Subject: [PATCH 10/23] improve timer title to reflect the average is ave(repeat runs), not ave(per-core-time) --- packages/panzer/mini-em/example/BlockPrec/main.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/panzer/mini-em/example/BlockPrec/main.cpp b/packages/panzer/mini-em/example/BlockPrec/main.cpp index 36d91fd648bc..3e2b642f9174 100644 --- a/packages/panzer/mini-em/example/BlockPrec/main.cpp +++ b/packages/panzer/mini-em/example/BlockPrec/main.cpp @@ -107,7 +107,6 @@ using TimersValType = std::pair>; std::unordered_map TimersBase = { {"evalJ", {0.0, {}}}, - {"capsg", {0.0, {}}}, {"capsg_G", {0.0, {}}}, {"capsg_G_1", {0.0, {}}}, {"capsg_G_2", {0.0, {}}}, @@ -976,9 +975,11 @@ int main(int argc,char * argv[]){ name.c_str(), mma[2], mma[1], mma[0], mma[1]/(mma[0] == 0.0 ? 1.0 : mma[0]) ); }; auto title = [&]() { + std::ostringstream oss; + oss << "AVE (of " << numRepeatRuns << " runs):"; printf("[TIMER] %d runs:\n" "[TIMER] %25s %20s %20s %20s %20s\n", - numRepeatRuns, "Name", "AVE:", "MAX:", "MIN:", "MAX/MIN:"); + numRepeatRuns, "Name", oss.str().c_str(), "MAX:", "MIN:", "MAX/MIN:"); }; using TimersDataType = std::pair ; From 4c03a4ff3969aecb8ea00806fc69ac7eb72c240f Mon Sep 17 00:00:00 2001 From: srkenno Date: Wed, 27 Nov 2024 10:00:34 -0700 Subject: [PATCH 11/23] add some timers around graph ops, including compute/apply padding; add option to sort timers using stable_sort - std::sort is segfaulting some times, possibly a compiler/optimization bug since valgrind didn't find an issue --- .../panzer/mini-em/example/BlockPrec/main.cpp | 73 +++++++++++++------ .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 5 ++ .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 8 ++ .../core/src/Tpetra_Details_CrsPadding.hpp | 9 +++ 4 files changed, 73 insertions(+), 22 deletions(-) diff --git a/packages/panzer/mini-em/example/BlockPrec/main.cpp b/packages/panzer/mini-em/example/BlockPrec/main.cpp index 3e2b642f9174..78897f9db6fb 100644 --- a/packages/panzer/mini-em/example/BlockPrec/main.cpp +++ b/packages/panzer/mini-em/example/BlockPrec/main.cpp @@ -104,25 +104,37 @@ bool panzer_impl_new = false; int panzer_impl_inp = 0; // 0, 1, 2=both using TimersValType = std::pair>; -std::unordered_map TimersBase = +using TimersDataType = std::pair ; +using TimersType = std::unordered_map; + +TimersType TimersBase = { - {"evalJ", {0.0, {}}}, - {"capsg_G", {0.0, {}}}, - {"capsg_G_1", {0.0, {}}}, - {"capsg_G_2", {0.0, {}}}, - {"capsg_G_3", {0.0, {}}}, - {"capsg_G_4", {0.0, {}}}, - {"capsg_M", {0.0, {}}}, - {"capsg_G_pad", {0.0, {}}}, - {"capsg_G_apad", {0.0, {}}}, - {"main", {0.0, {}}}, - {"eval_scatter", {0.0, {}}}, - {"gedc-g2g", {0.0, {}}}, - {"lof-g2gc", {0.0, {}}}, + {"test", {0,{}}}, + {"capsg_G", {0,{}}}, + {"capsg_G_1", {0,{}}}, + {"capsg_G_2", {0,{}}}, + {"capsg_G_3", {0,{}}}, + {"capsg_G_4", {0,{}}}, + {"capsg_G_apad", {0,{}}}, + {"capsg_G_pad", {0,{}}}, + {"capsg_G_pad_merge", {0,{}}}, + {"capsg_G_pad_perm", {0,{}}}, + {"capsg_G_pad_same", {0,{}}}, + {"capsg_G_pad_sort", {0,{}}}, + {"capsg_M", {0,{}}}, + {"capsg_M_apad", {0,{}}}, + {"capsg_M_pad", {0,{}}}, + {"capsg_M_ucmac", {0,{}}}, + {"evalJ", {0,{}}}, + {"eval_scatter", {0,{}}}, + {"gedc-g2g", {0,{}}}, + {"lof-g2gc", {0,{}}}, + {"main", {0,{}}} }; -std::unordered_map& Timers = TimersBase; -std::unordered_map *TimersPtr = &TimersBase; +const size_t TimersBaseSize = TimersBase.size(); +auto TimersVal = TimersBase; +std::unordered_map& Timers = TimersVal; double timer_MV=0.0; double timer_ICI=0.0; @@ -133,8 +145,8 @@ double timer_evalJ=0.0; double timer_capsg=0.0; double timer_main=0.0; -int numRepeatRuns = 1; -int repeat = 0; +size_t numRepeatRuns = 1; +size_t repeat = 0; template static T parallel_reduce(Teuchos::RCP > comm, T& localVal, Teuchos::EReductionType red) { @@ -878,9 +890,11 @@ int main(int argc,char * argv[]){ solverType solverValues[5] = {AUGMENTATION, MUELU, ML, CG, GMRES}; const char * solverNames[5] = {"Augmentation", "MueLu", "ML", "CG", "GMRES"}; solverType solver = MUELU; + bool use_stable_sort=true; clp.setOption("solver",&solver,5,solverValues,solverNames,"Solver that is used"); clp.setOption("num-repeat-runs",&numRepeatRuns); clp.setOption("use-evalJ","no-use-evalJ", &use_eval_J,"Run with sub-timers only active if evalModel(J) is active."); + clp.setOption("stable-sort","no-stable-sort",&use_stable_sort,"use std::stable_sort in timers output."); // bool useComplex = false; // clp.setOption("complex","real",&useComplex); @@ -941,12 +955,22 @@ int main(int argc,char * argv[]){ } if (1) { + if (!comm->getRank()) { + std::cout << "[srk] TimersBaseSize= " << TimersBaseSize << " TimersBase.size= " << TimersBase.size() + << " Timers.size= " << Timers.size() << std::endl; + + TEUCHOS_TEST_FOR_EXCEPTION(TimersBaseSize != Timers.size(), + std::runtime_error, + "Timers not consistent, check TimersBase has all timers."); + } + for (auto& t : Timers) { t.second.first = parallel_reduce(comm, t.second.first, Teuchos::REDUCE_MAX); } if (!comm->getRank()) { for (auto& t : Timers) { std::cout << "[TIMER] repeat= " << repeat << " " << t.first << " = " << t.second.first << std::endl; + if (t.second.second.size() != numRepeatRuns) t.second.second.resize(numRepeatRuns); t.second.second[repeat] = t.second.first; } } @@ -977,14 +1001,20 @@ int main(int argc,char * argv[]){ auto title = [&]() { std::ostringstream oss; oss << "AVE (of " << numRepeatRuns << " runs):"; - printf("[TIMER] %d runs:\n" + printf("[TIMER] %zu runs:\n" "[TIMER] %25s %20s %20s %20s %20s\n", numRepeatRuns, "Name", oss.str().c_str(), "MAX:", "MIN:", "MAX/MIN:"); }; - using TimersDataType = std::pair ; + std::vector vt(Timers.begin(), Timers.end()); - std::sort(vt.begin(), vt.end(), [](const TimersDataType& a, const TimersDataType& b) { return a.second.first >= b.second.first; }); + // std::sort hits a segfault, stable_sort doesn't, go figure + auto lam = [](const TimersDataType& a, const TimersDataType& b) { return a.second.first >= b.second.first; }; + if (use_stable_sort) { + std::stable_sort(vt.begin(), vt.end(), lam); + } else { + std::sort(vt.begin(), vt.end(), lam); + } title(); for (const auto& t : vt) { minMaxAve(t.second.second, MinMaxAve); @@ -998,7 +1028,6 @@ int main(int argc,char * argv[]){ return retVal; } - template void writeToExodus(double time_stamp, const Teuchos::RCP > & x, diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 8a847155d7bd..3a1466ca875c 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -5172,10 +5172,15 @@ namespace Tpetra { new padding_type(myRank, numSameIDs, permuteFromLIDs.extent(0))); + double sameTime = Teuchos::Time::wallTime(); computeCrsPaddingForSameIDs(*padding, source, static_cast(numSameIDs)); + Timers["capsg_G_pad_same"].first += -sameTime + Teuchos::Time::wallTime(); + + double permTime = Teuchos::Time::wallTime(); computeCrsPaddingForPermutedIDs(*padding, source, permuteToLIDs, permuteFromLIDs); + Timers["capsg_G_pad_perm"].first += -permTime + Teuchos::Time::wallTime(); return padding; } diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 2124af0a914c..4c1faf038e51 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -7349,18 +7349,24 @@ CrsMatrix:: } if (isStaticGraph ()) { + double padTime = Teuchos::Time::wallTime(); + using Details::unpackCrsMatrixAndCombineNew; unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID, importLIDs, constantNumPackets, combineMode); + Timers["capsg_M_ucmac"].first += -padTime + Teuchos::Time::wallTime(); + } else { { using padding_type = typename crs_graph_type::padding_type; std::unique_ptr padding; try { + double padTime = Teuchos::Time::wallTime(); padding = myGraph_->computePaddingForCrsMatrixUnpack( importLIDs, imports, numPacketsPerLID, verbose); + Timers["capsg_M_pad"].first += -padTime + Teuchos::Time::wallTime(); } catch (std::exception& e) { const auto rowMap = getRowMap(); @@ -7377,7 +7383,9 @@ CrsMatrix:: os << *prefix << "Call applyCrsPadding" << endl; std::cerr << os.str(); } + double padTime = Teuchos::Time::wallTime(); applyCrsPadding(*padding, verbose); + Timers["capsg_M_apad"].first += -padTime + Teuchos::Time::wallTime(); } if (verbose) { std::ostringstream os; diff --git a/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp b/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp index 74a42a51d50b..bfa7f526c9dd 100644 --- a/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp @@ -12,11 +12,16 @@ #include "Tpetra_Details_Behavior.hpp" #include "Tpetra_Util.hpp" +#include "Teuchos_Time.hpp" #include #include #include #include #include +#include +#include + +extern std::unordered_map> >& Timers; namespace Tpetra { namespace Details { @@ -170,6 +175,7 @@ namespace Tpetra { // FIXME (08 Feb 2020) We only need to sort and unique // tgtGblColInds if we haven't already seen it before. + double time_ = Teuchos::Time::wallTime(); size_t newNumTgtEnt = origNumTgtEnt; auto tgtEnd = tgtGblColInds + origNumTgtEnt; std::sort(tgtGblColInds, tgtEnd); @@ -193,10 +199,13 @@ namespace Tpetra { newNumSrcEnt = size_t(srcEnd - srcGblColInds); TEUCHOS_ASSERT( newNumSrcEnt <= origNumSrcEnt ); } + Timers["capsg_G_pad_sort"].first += -time_ + Teuchos::Time::wallTime(); + time_ = Teuchos::Time::wallTime(); merge_with_current_state(phase, whichImport, targetLocalIndex, tgtGblColInds, newNumTgtEnt, srcGblColInds, newNumSrcEnt); + Timers["capsg_G_pad_merge"].first += -time_ + Teuchos::Time::wallTime(); if (verbose_) { std::ostringstream os; os << *prefix << "Done" << endl; From cc22d9f27160868d5020e930e39d57ae1508e466 Mon Sep 17 00:00:00 2001 From: srkenno Date: Tue, 3 Dec 2024 13:09:42 -0700 Subject: [PATCH 12/23] fix to allow use outside panzer --- .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 30 ++++++++++++++----- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 7 ++++- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index c3082d354a63..5f0a38bbccfc 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -50,9 +50,11 @@ extern double timer_evalJ; extern double timer_capsg; #define PANZER_IMPL_NEW panzer_impl_new #define PANZER_IMPL_OLD panzer_impl_old +#define IN_EVAL_J in_eval_J #else #define PANZER_IMPL_NEW true #define PANZER_IMPL_OLD false +#define IN_EVAL_J true #endif namespace Tpetra { @@ -4896,10 +4898,14 @@ namespace Tpetra { double padTime = Teuchos::Time::wallTime(); auto padding = computeCrsPadding(srcRowGraph, numSameIDs, permuteToLIDs, permuteFromLIDs, verbose); - if (in_eval_J) Timers["capsg_G_pad"].first += -padTime + Teuchos::Time::wallTime(); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM + if (IN_EVAL_J) Timers["capsg_G_pad"].first += -padTime + Teuchos::Time::wallTime(); +#endif double apadTime = Teuchos::Time::wallTime(); applyCrsPadding(*padding, verbose); - if (in_eval_J) Timers["capsg_G_apad"].first += -apadTime + Teuchos::Time::wallTime(); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM + if (IN_EVAL_J) Timers["capsg_G_apad"].first += -apadTime + Teuchos::Time::wallTime(); +#endif // If the source object is actually a CrsGraph, we can use view // mode instead of copy mode to access the entries in each row, @@ -4937,7 +4943,9 @@ namespace Tpetra { srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); this->insertGlobalIndices (gid, row_length, row_copy.data()); } - if (in_eval_J) Timers["capsg_G_1"].first += -time + Teuchos::Time::wallTime(); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM + if (IN_EVAL_J) Timers["capsg_G_1"].first += -time + Teuchos::Time::wallTime(); +#endif } else { double time = Teuchos::Time::wallTime(); if (verbose) { @@ -4951,7 +4959,9 @@ namespace Tpetra { srcCrsGraph->getGlobalRowView (gid, row); this->insertGlobalIndices (gid, row.extent(0), row.data()); } - if (in_eval_J) Timers["capsg_G_2"].first += -time + Teuchos::Time::wallTime(); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM + if (IN_EVAL_J) Timers["capsg_G_2"].first += -time + Teuchos::Time::wallTime(); +#endif } // @@ -4971,7 +4981,9 @@ namespace Tpetra { srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length); this->insertGlobalIndices (mygid, row_length, row_copy.data()); } - if (in_eval_J) Timers["capsg_G_3"].first += -time + Teuchos::Time::wallTime(); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM + if (IN_EVAL_J) Timers["capsg_G_3"].first += -time + Teuchos::Time::wallTime(); +#endif } else { double time = Teuchos::Time::wallTime(); for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { @@ -4981,7 +4993,9 @@ namespace Tpetra { srcCrsGraph->getGlobalRowView (srcgid, row); this->insertGlobalIndices (mygid, row.extent(0), row.data()); } - if (in_eval_J) Timers["capsg_G_4"].first += -time + Teuchos::Time::wallTime(); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM + if (IN_EVAL_J) Timers["capsg_G_4"].first += -time + Teuchos::Time::wallTime(); +#endif } if (verbose) { @@ -4989,9 +5003,11 @@ namespace Tpetra { os << *prefix << "Done" << endl; std::cerr << os.str (); } - if (in_eval_J) { +#if EXP_INCLUDED_FROM_PANXER_MINI_EM + if (IN_EVAL_J) { Timers["capsg_G"].first += -capTime + Teuchos::Time::wallTime(); } +#endif } diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index dd2554d54ccb..7316e467d741 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -7358,8 +7358,9 @@ CrsMatrix:: unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID, importLIDs, constantNumPackets, combineMode); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM Timers["capsg_M_ucmac"].first += -padTime + Teuchos::Time::wallTime(); - +#endif } else { { @@ -7369,7 +7370,9 @@ CrsMatrix:: double padTime = Teuchos::Time::wallTime(); padding = myGraph_->computePaddingForCrsMatrixUnpack( importLIDs, imports, numPacketsPerLID, verbose); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM Timers["capsg_M_pad"].first += -padTime + Teuchos::Time::wallTime(); +#endif } catch (std::exception& e) { const auto rowMap = getRowMap(); @@ -7388,7 +7391,9 @@ CrsMatrix:: } double padTime = Teuchos::Time::wallTime(); applyCrsPadding(*padding, verbose); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM Timers["capsg_M_apad"].first += -padTime + Teuchos::Time::wallTime(); +#endif } if (verbose) { std::ostringstream os; From db2b14c9a7607d8cec88ccb02aaaa80473437b0f Mon Sep 17 00:00:00 2001 From: srkenno Date: Tue, 3 Dec 2024 14:29:50 -0700 Subject: [PATCH 13/23] protect new code --- .../disc-fe/src/Panzer_AssemblyEngine_impl.hpp | 15 ++++++++++++--- .../disc-fe/src/Panzer_ModelEvaluator_impl.hpp | 9 ++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/packages/panzer/disc-fe/src/Panzer_AssemblyEngine_impl.hpp b/packages/panzer/disc-fe/src/Panzer_AssemblyEngine_impl.hpp index d1b8da9e7a33..84f29d08bd53 100644 --- a/packages/panzer/disc-fe/src/Panzer_AssemblyEngine_impl.hpp +++ b/packages/panzer/disc-fe/src/Panzer_AssemblyEngine_impl.hpp @@ -20,8 +20,11 @@ #include #include +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 +#if EXP_INCLUDED_FROM_PANXER_MINI_EM extern std::unordered_map> >& Timers; extern bool in_eval_J; +#endif //=========================================================================== //=========================================================================== @@ -93,24 +96,30 @@ evaluate(const panzer::AssemblyEngineInArgs& in, const EvaluationFlags flags) } if ( flags.getValue() & EvaluationFlags::Scatter) { - double time = Teuchos::Time::wallTime(); + [[maybe_unused]] double time = Teuchos::Time::wallTime(); PANZER_FUNC_TIME_MONITOR_DIFF("panzer::AssemblyEngine::evaluate_scatter("+PHX::print()+")",eval_scatter); { - double time1 = Teuchos::Time::wallTime(); + [[maybe_unused]] double time1 = Teuchos::Time::wallTime(); PANZER_FUNC_TIME_MONITOR_DIFF("panzer::AssemblyEngine::lof->ghostToGlobalContainer("+PHX::print()+")",lof_gtgc); m_lin_obj_factory->ghostToGlobalContainer(*in.ghostedContainer_,*in.container_,LOC::F | LOC::Mat); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM if (in_eval_J) Timers["lof-g2gc"].first += -time1 + Teuchos::Time::wallTime(); +#endif } { - double time1 = Teuchos::Time::wallTime(); + [[maybe_unused]] double time1 = Teuchos::Time::wallTime(); PANZER_FUNC_TIME_MONITOR_DIFF("panzer::AssemblyEngine::gedc.ghostToGlobal("+PHX::print()+")",gedc_gtg); m_lin_obj_factory->beginFill(*in.container_); gedc.ghostToGlobal(LOC::F | LOC::Mat); m_lin_obj_factory->endFill(*in.container_); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM if (in_eval_J) Timers["gedc-g2g"].first += -time1 + Teuchos::Time::wallTime(); +#endif } m_lin_obj_factory->endFill(*in.ghostedContainer_); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM if (in_eval_J) Timers["eval_scatter"].first += -time + Teuchos::Time::wallTime(); +#endif } return; diff --git a/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp b/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp index c7ec62c2436d..0a3dcc88c451 100644 --- a/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp +++ b/packages/panzer/disc-fe/src/Panzer_ModelEvaluator_impl.hpp @@ -47,12 +47,15 @@ #include "Thyra_TpetraLinearOp.hpp" #include "Tpetra_CrsMatrix.hpp" + +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 +#if EXP_INCLUDED_FROM_PANXER_MINI_EM extern bool use_eval_J; extern bool in_eval_J; extern std::unordered_map> >& Timers; - extern double timer_evalJ; extern double timer_capsg; +#endif // Constructors/Initializers/Accessors @@ -1576,8 +1579,10 @@ evalModelImpl_basic(const Thyra::ModelEvaluatorBase::InArgs &inArgs, else if(Teuchos::is_null(f_out) && !Teuchos::is_null(W_out)) { PANZER_FUNC_TIME_MONITOR("panzer::ModelEvaluator::evalModel(J)"); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM double time_ = Teuchos::Time::wallTime(); in_eval_J = true; +#endif // only add auxiliary global data if Jacobian is being formed ae_inargs.addGlobalEvaluationData(nonParamGlobalEvaluationData_); @@ -1590,8 +1595,10 @@ evalModelImpl_basic(const Thyra::ModelEvaluatorBase::InArgs &inArgs, thGhostedContainer->initializeMatrix(0.0); ae_tm_.template getAsObject()->evaluate(ae_inargs); +#if EXP_INCLUDED_FROM_PANXER_MINI_EM in_eval_J = !use_eval_J; Timers["evalJ"].first += -time_ + Teuchos::Time::wallTime(); +#endif } // HACK: set A to null before calling responses to avoid touching the From fc61b0354271e11772197afd2cf59b61bad2a6fb Mon Sep 17 00:00:00 2001 From: srkenno Date: Fri, 6 Dec 2024 06:59:38 -0700 Subject: [PATCH 14/23] protect timers Signed-off-by: srkenno --- .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 59 +++++++------------ .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 37 ++++-------- .../core/src/Tpetra_Details_CrsPadding.hpp | 24 ++++++-- 3 files changed, 53 insertions(+), 67 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 5f0a38bbccfc..4eb94e2368ad 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -43,6 +43,7 @@ #define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 #if EXP_INCLUDED_FROM_PANXER_MINI_EM +#define INCL_EXP(a) a extern std::unordered_map> >& Timers; extern bool panzer_impl_new, panzer_impl_old; extern bool in_eval_J; @@ -52,6 +53,7 @@ extern double timer_capsg; #define PANZER_IMPL_OLD panzer_impl_old #define IN_EVAL_J in_eval_J #else +#define INCL_EXP(a) do {} while(0) #define PANZER_IMPL_NEW true #define PANZER_IMPL_OLD false #define IN_EVAL_J true @@ -4869,8 +4871,7 @@ namespace Tpetra { const bool verbose = verbose_; Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermute"); - double capTime = Teuchos::Time::wallTime(); - + INCL_EXP(double capTime = Teuchos::Time::wallTime()); std::unique_ptr prefix; if (verbose) { prefix = this->createPrefix("CrsGraph", "copyAndPermute"); @@ -4895,17 +4896,14 @@ namespace Tpetra { os << *prefix << "Compute padding" << endl; std::cerr << os.str (); } - double padTime = Teuchos::Time::wallTime(); + INCL_EXP(double padTime = Teuchos::Time::wallTime()); auto padding = computeCrsPadding(srcRowGraph, numSameIDs, permuteToLIDs, permuteFromLIDs, verbose); -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - if (IN_EVAL_J) Timers["capsg_G_pad"].first += -padTime + Teuchos::Time::wallTime(); -#endif - double apadTime = Teuchos::Time::wallTime(); + + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_pad"].first += -padTime + Teuchos::Time::wallTime()); + INCL_EXP(double apadTime = Teuchos::Time::wallTime()); applyCrsPadding(*padding, verbose); -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - if (IN_EVAL_J) Timers["capsg_G_apad"].first += -apadTime + Teuchos::Time::wallTime(); -#endif + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_apad"].first += -apadTime + Teuchos::Time::wallTime()); // If the source object is actually a CrsGraph, we can use view // mode instead of copy mode to access the entries in each row, @@ -4933,8 +4931,7 @@ namespace Tpetra { // compatible with the expectations of view mode. Also, if the // source graph is not a CrsGraph, we can't use view mode, // because RowGraph only provides copy mode access to the data. - double time = Teuchos::Time::wallTime(); - + INCL_EXP(double time = Teuchos::Time::wallTime()); for (size_t i = 0; i < numSameIDs; ++i, ++myid) { const GO gid = srcRowMap.getGlobalElement (myid); size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid); @@ -4943,11 +4940,9 @@ namespace Tpetra { srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); this->insertGlobalIndices (gid, row_length, row_copy.data()); } -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - if (IN_EVAL_J) Timers["capsg_G_1"].first += -time + Teuchos::Time::wallTime(); -#endif + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_1"].first += -time + Teuchos::Time::wallTime()); } else { - double time = Teuchos::Time::wallTime(); + INCL_EXP(double time = Teuchos::Time::wallTime()); if (verbose) { std::ostringstream os; os << *prefix << "! src_filled && srcCrsGraph != nullptr" << endl; @@ -4959,9 +4954,7 @@ namespace Tpetra { srcCrsGraph->getGlobalRowView (gid, row); this->insertGlobalIndices (gid, row.extent(0), row.data()); } -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - if (IN_EVAL_J) Timers["capsg_G_2"].first += -time + Teuchos::Time::wallTime(); -#endif + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_2"].first += -time + Teuchos::Time::wallTime()); } // @@ -4971,7 +4964,7 @@ namespace Tpetra { auto permuteFromLIDs_h = permuteFromLIDs.view_host (); if (src_filled || srcCrsGraph == nullptr) { - double time = Teuchos::Time::wallTime(); + INCL_EXP(double time = Teuchos::Time::wallTime()); for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); @@ -4981,11 +4974,9 @@ namespace Tpetra { srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length); this->insertGlobalIndices (mygid, row_length, row_copy.data()); } -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - if (IN_EVAL_J) Timers["capsg_G_3"].first += -time + Teuchos::Time::wallTime(); -#endif + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_3"].first += -time + Teuchos::Time::wallTime()); } else { - double time = Teuchos::Time::wallTime(); + INCL_EXP(double time = Teuchos::Time::wallTime()); for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); @@ -4993,9 +4984,7 @@ namespace Tpetra { srcCrsGraph->getGlobalRowView (srcgid, row); this->insertGlobalIndices (mygid, row.extent(0), row.data()); } -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - if (IN_EVAL_J) Timers["capsg_G_4"].first += -time + Teuchos::Time::wallTime(); -#endif + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_4"].first += -time + Teuchos::Time::wallTime()); } if (verbose) { @@ -5003,12 +4992,7 @@ namespace Tpetra { os << *prefix << "Done" << endl; std::cerr << os.str (); } -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - if (IN_EVAL_J) { - Timers["capsg_G"].first += -capTime + Teuchos::Time::wallTime(); - } -#endif - + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G"].first += -capTime + Teuchos::Time::wallTime()); } template @@ -5187,15 +5171,14 @@ namespace Tpetra { new padding_type(myRank, numSameIDs, permuteFromLIDs.extent(0))); - double sameTime = Teuchos::Time::wallTime(); + INCL_EXP(double sameTime = Teuchos::Time::wallTime()); computeCrsPaddingForSameIDs(*padding, source, static_cast(numSameIDs)); - Timers["capsg_G_pad_same"].first += -sameTime + Teuchos::Time::wallTime(); - - double permTime = Teuchos::Time::wallTime(); + INCL_EXP(Timers["capsg_G_pad_same"].first += -sameTime + Teuchos::Time::wallTime()); + INCL_EXP(double permTime = Teuchos::Time::wallTime()); computeCrsPaddingForPermutedIDs(*padding, source, permuteToLIDs, permuteFromLIDs); - Timers["capsg_G_pad_perm"].first += -permTime + Teuchos::Time::wallTime(); + INCL_EXP(Timers["capsg_G_pad_perm"].first += -permTime + Teuchos::Time::wallTime()); return padding; } diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 7316e467d741..8915270d1166 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -62,6 +62,7 @@ #define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 #if EXP_INCLUDED_FROM_PANXER_MINI_EM +#define INCL_EXP(a) a extern bool panzer_impl_new, panzer_impl_old; extern std::unordered_map> >& Timers; extern bool in_eval_J; @@ -70,6 +71,7 @@ extern double timer_capsg; #define PANZER_IMPL_NEW panzer_impl_new #define PANZER_IMPL_OLD panzer_impl_old #else +#define INCL_EXP(a) do {} while(0) #define PANZER_IMPL_NEW true #define PANZER_IMPL_OLD false #endif @@ -6365,7 +6367,7 @@ CrsMatrix:: const RMT& srcMat = dynamic_cast (srcObj); if (isStaticGraph ()) { if (PANZER_IMPL_NEW) { - double time_ = Teuchos::Time::wallTime(); + INCL_EXP(double time_ = Teuchos::Time::wallTime()); TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () ); auto permuteToLIDs_d = permuteToLIDs.view_device (); TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () ); @@ -6376,15 +6378,10 @@ CrsMatrix:: permuteToLIDs_d.data(), permuteFromLIDs_d.data(), numPermute); -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - if (in_eval_J) { - Timers["capsg_M"].first += -time_ + Teuchos::Time::wallTime(); - } -#endif - + INCL_EXP(if (in_eval_J) Timers["capsg_M"].first += -time_ + Teuchos::Time::wallTime()); } if (PANZER_IMPL_OLD) { - double time_ = Teuchos::Time::wallTime(); + INCL_EXP(double time_ = Teuchos::Time::wallTime()); TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () ); auto permuteToLIDs_h = permuteToLIDs.view_host (); TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () ); @@ -6394,11 +6391,7 @@ CrsMatrix:: permuteToLIDs_h.data(), permuteFromLIDs_h.data(), numPermute); -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - if (in_eval_J) { - Timers["capsg_M"].first += -time_ + Teuchos::Time::wallTime(); - } -#endif + INCL_EXP(if (in_eval_J) Timers["capsg_M"].first += -time_ + Teuchos::Time::wallTime()); } } else { @@ -7352,27 +7345,23 @@ CrsMatrix:: } if (isStaticGraph ()) { - double padTime = Teuchos::Time::wallTime(); + INCL_EXP(double padTime = Teuchos::Time::wallTime()); using Details::unpackCrsMatrixAndCombineNew; unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID, importLIDs, constantNumPackets, combineMode); -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - Timers["capsg_M_ucmac"].first += -padTime + Teuchos::Time::wallTime(); -#endif + INCL_EXP(Timers["capsg_M_ucmac"].first += -padTime + Teuchos::Time::wallTime()); } else { { using padding_type = typename crs_graph_type::padding_type; std::unique_ptr padding; try { - double padTime = Teuchos::Time::wallTime(); + INCL_EXP(double padTime = Teuchos::Time::wallTime()); padding = myGraph_->computePaddingForCrsMatrixUnpack( importLIDs, imports, numPacketsPerLID, verbose); -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - Timers["capsg_M_pad"].first += -padTime + Teuchos::Time::wallTime(); -#endif + INCL_EXP(Timers["capsg_M_pad"].first += -padTime + Teuchos::Time::wallTime()); } catch (std::exception& e) { const auto rowMap = getRowMap(); @@ -7389,11 +7378,9 @@ CrsMatrix:: os << *prefix << "Call applyCrsPadding" << endl; std::cerr << os.str(); } - double padTime = Teuchos::Time::wallTime(); + INCL_EXP(double padTime = Teuchos::Time::wallTime()); applyCrsPadding(*padding, verbose); -#if EXP_INCLUDED_FROM_PANXER_MINI_EM - Timers["capsg_M_apad"].first += -padTime + Teuchos::Time::wallTime(); -#endif + INCL_EXP(Timers["capsg_M_apad"].first += -padTime + Teuchos::Time::wallTime()); } if (verbose) { std::ostringstream os; diff --git a/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp b/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp index bfa7f526c9dd..5c9348c0ae97 100644 --- a/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp @@ -21,7 +21,23 @@ #include #include +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 +#if EXP_INCLUDED_FROM_PANXER_MINI_EM +#define INCL_EXP(a) a extern std::unordered_map> >& Timers; +extern bool panzer_impl_new, panzer_impl_old; +extern bool in_eval_J; +extern double timer_evalJ; +extern double timer_capsg; +#define PANZER_IMPL_NEW panzer_impl_new +#define PANZER_IMPL_OLD panzer_impl_old +#define IN_EVAL_J in_eval_J +#else +#define INCL_EXP(a) do {} while(0) +#define PANZER_IMPL_NEW true +#define PANZER_IMPL_OLD false +#define IN_EVAL_J true +#endif namespace Tpetra { namespace Details { @@ -175,7 +191,7 @@ namespace Tpetra { // FIXME (08 Feb 2020) We only need to sort and unique // tgtGblColInds if we haven't already seen it before. - double time_ = Teuchos::Time::wallTime(); + INCL_EXP(double time_ = Teuchos::Time::wallTime()); size_t newNumTgtEnt = origNumTgtEnt; auto tgtEnd = tgtGblColInds + origNumTgtEnt; std::sort(tgtGblColInds, tgtEnd); @@ -199,13 +215,13 @@ namespace Tpetra { newNumSrcEnt = size_t(srcEnd - srcGblColInds); TEUCHOS_ASSERT( newNumSrcEnt <= origNumSrcEnt ); } - Timers["capsg_G_pad_sort"].first += -time_ + Teuchos::Time::wallTime(); + INCL_EXP(Timers["capsg_G_pad_sort"].first += -time_ + Teuchos::Time::wallTime()); - time_ = Teuchos::Time::wallTime(); + INCL_EXP(time_ = Teuchos::Time::wallTime()); merge_with_current_state(phase, whichImport, targetLocalIndex, tgtGblColInds, newNumTgtEnt, srcGblColInds, newNumSrcEnt); - Timers["capsg_G_pad_merge"].first += -time_ + Teuchos::Time::wallTime(); + INCL_EXP(Timers["capsg_G_pad_merge"].first += -time_ + Teuchos::Time::wallTime()); if (verbose_) { std::ostringstream os; os << *prefix << "Done" << endl; From 1bab4699f375e43e7463303d2f64b650011ff13d Mon Sep 17 00:00:00 2001 From: srkenno Date: Fri, 6 Dec 2024 14:35:14 -0700 Subject: [PATCH 15/23] fix compile errors (complex data type) Signed-off-by: srkenno --- packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp | 1 + packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 4eb94e2368ad..d1d475376eb5 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -5222,6 +5222,7 @@ namespace Tpetra { std::vector tgtGblColIndsScratch; execute_sync_host_uvm_access(); // protect host UVM access + // FIXME parallel_for for (LO lclRowInd = 0; lclRowInd < numSameIDs; ++lclRowInd) { const GO srcGblRowInd = srcRowMap.getGlobalElement(lclRowInd); const GO tgtGblRowInd = tgtRowMap.getGlobalElement(lclRowInd); diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 8915270d1166..51c09f043033 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -5892,8 +5892,11 @@ CrsMatrix:: idx += 1; } } - auto inds = Kokkos::subview(indices, Kokkos::make_pair(size_t(0), idx)); - auto vals = Kokkos::subview(values, Kokkos::make_pair(size_t(0), idx)); + Kokkos::View indices_const(indices.data(), indices.size()); + const impl_scalar_type* const values_const_data = reinterpret_cast(values.data()); + Kokkos::View values_const(values_const_data, values.size()); + auto inds = Kokkos::subview(indices_const, Kokkos::make_pair(size_t(0), idx)); + auto vals = Kokkos::subview(values_const, Kokkos::make_pair(size_t(0), idx)); this->replaceLocalValues(local_row, inds, vals); } } else if (sourceIsLocallyIndexed) { @@ -9648,7 +9651,7 @@ CrsMatrix:: local_inds_device_value_t numInTgtRow = (tend - tstart); KOKKOS_ASSERT(tstart < tvals.extent(0)); - Scalar *tgtRowVals = &tvals(tstart); + impl_scalar_type *tgtRowVals = reinterpret_cast(&tvals(tstart)); const local_inds_device_value_t *tgtColInds = &tgtLocalColIndsDevice(tstart); size_t hint=0; From 56795a1cebb04b58ff8796e02cbc7ad9345a4dec Mon Sep 17 00:00:00 2001 From: srkenno Date: Thu, 12 Dec 2024 09:06:29 -0700 Subject: [PATCH 16/23] towards kokkos version of copyAndPermute --- packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp | 11 +++++++++++ packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp | 9 ++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp index 85f91bd676c2..6a089ac93b14 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp @@ -1194,6 +1194,17 @@ namespace Tpetra { buffer_device_type>& permuteFromLIDs, const CombineMode CM) override; + void + copyAndPermuteNew + (const row_graph_type& source, + row_graph_type& target, + const size_t numSameIDs, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + const CombineMode CM); + using padding_type = Details::CrsPadding< local_ordinal_type, global_ordinal_type>; diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index d1d475376eb5..a5d2c128d61a 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -4870,6 +4870,13 @@ namespace Tpetra { const char tfecfFuncName[] = "copyAndPermute: "; const bool verbose = verbose_; + if (PANZER_IMPL_NEW) { + const row_graph_type& srcRowGraph = + dynamic_cast (source); + copyAndPermuteNew(srcRowGraph, *this, numSameIDs, permuteToLIDs, permuteFromLIDs, INSERT); + return; + } + Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermute"); INCL_EXP(double capTime = Teuchos::Time::wallTime()); std::unique_ptr prefix; @@ -7662,7 +7669,7 @@ namespace Tpetra { return output; } - +#include "graphCopyAndPermuteNew.hpp" } // namespace Tpetra From 1c8d1f310547f247017bdcf4bb43b72c0123e126 Mon Sep 17 00:00:00 2001 From: srkenno Date: Thu, 12 Dec 2024 14:00:28 -0700 Subject: [PATCH 17/23] more kokkos work --- .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 2 +- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 2 +- .../core/src/Tpetra_Details_CrsPadding.hpp | 2 +- .../core/src/graphCopyAndPermuteNew.hpp | 166 ++++++++++++++++++ packages/tpetra/core/src/inner.hpp | 126 +++++++++++++ 5 files changed, 295 insertions(+), 3 deletions(-) create mode 100644 packages/tpetra/core/src/graphCopyAndPermuteNew.hpp create mode 100644 packages/tpetra/core/src/inner.hpp diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index a5d2c128d61a..820aaf456dc8 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -41,7 +41,7 @@ #include #include -#define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 #if EXP_INCLUDED_FROM_PANXER_MINI_EM #define INCL_EXP(a) a extern std::unordered_map> >& Timers; diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 51c09f043033..1a79cc54c25a 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -60,7 +60,7 @@ #include #include -#define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 #if EXP_INCLUDED_FROM_PANXER_MINI_EM #define INCL_EXP(a) a extern bool panzer_impl_new, panzer_impl_old; diff --git a/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp b/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp index 5c9348c0ae97..27dff1c605fe 100644 --- a/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp @@ -21,7 +21,7 @@ #include #include -#define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 #if EXP_INCLUDED_FROM_PANXER_MINI_EM #define INCL_EXP(a) a extern std::unordered_map> >& Timers; diff --git a/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp b/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp new file mode 100644 index 000000000000..e3542e38407f --- /dev/null +++ b/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp @@ -0,0 +1,166 @@ +template +void +CrsGraph:: +copyAndPermuteNew( + const row_graph_type& srcRowGraph, + row_graph_type& tgtRowGraph, + const size_t numSameIDs, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + const CombineMode CM) +{ + using std::endl; + using LO = local_ordinal_type; + using GO = global_ordinal_type; + using this_CRS_type = CrsGraph; + const char tfecfFuncName[] = "copyAndPermuteNew: "; + const bool verbose = verbose_; + + Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermuteNew"); + INCL_EXP(double capTime = Teuchos::Time::wallTime()); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("CrsGraph", "copyAndPermuteNew"); + std::ostringstream os; + os << *prefix << endl; + std::cerr << os.str (); + } + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC + (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0), + std::runtime_error, "permuteToLIDs.extent(0) = " + << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = " + << permuteFromLIDs.extent (0) << "."); + + if (verbose) { + std::ostringstream os; + os << *prefix << "Compute padding" << endl; + std::cerr << os.str (); + } + + // std::cout << "here 0" << std::endl; + + using crs_graph_type = CrsGraph; + const crs_graph_type *srcCrsGraphPtr = dynamic_cast(&srcRowGraph); + if (!srcCrsGraphPtr) { + std::cout << "srk error srcGraph type= " << typeid(srcRowGraph).name() << std::endl; + std::terminate(); + } + const crs_graph_type& srcCrsGraph = *srcCrsGraphPtr; + + crs_graph_type *tgtCrsGraphPtr = dynamic_cast(&tgtRowGraph); + if (!tgtCrsGraphPtr) { + std::cout << "srk error tgtGraph type= " << typeid(tgtRowGraph).name() << std::endl; + std::terminate(); + } + crs_graph_type& tgtCrsGraph = *tgtCrsGraphPtr; + // std::cout << "here 1" << std::endl; + + INCL_EXP(double padTime = Teuchos::Time::wallTime()); + auto padding = tgtCrsGraph.computeCrsPadding(srcRowGraph, numSameIDs, + permuteToLIDs, permuteFromLIDs, verbose); + + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_pad"].first += -padTime + Teuchos::Time::wallTime()); + INCL_EXP(double apadTime = Teuchos::Time::wallTime()); + tgtCrsGraph.applyCrsPadding(*padding, verbose); + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_apad"].first += -apadTime + Teuchos::Time::wallTime()); + + const map_type& srcRowMap = *(srcRowGraph.getRowMap()); + const map_type& tgtRowMap = *(tgtRowGraph.getRowMap()); + const bool src_filled = srcRowGraph.isFillComplete(); + nonconst_global_inds_host_view_type row_copy; + LO myid = 0; + + // std::cout << "here 2" << std::endl; + + // + // "Copy" part of "copy and permute." + // + if (src_filled || srcCrsGraphPtr == nullptr) { + if (verbose) { + std::ostringstream os; + os << *prefix << "src_filled || srcCrsGraph == nullptr" << endl; + std::cerr << os.str (); + } + // If the source graph is fill complete, we can't use view mode, + // because the data might be stored in a different format not + // compatible with the expectations of view mode. Also, if the + // source graph is not a CrsGraph, we can't use view mode, + // because RowGraph only provides copy mode access to the data. + INCL_EXP(double time = Teuchos::Time::wallTime()); +#if 1 + // std::cout << "here 3" << std::endl; + + for (size_t i = 0; i < numSameIDs; ++i, ++myid) { + const GO gid = srcRowMap.getGlobalElement (myid); + size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid); + Kokkos::resize(row_copy,row_length); + size_t check_row_length = 0; + srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); + tgtCrsGraph.insertGlobalIndices (gid, row_length, row_copy.data()); + } + // std::cout << "here 4" << std::endl; + +#else + + +#include "inner.hpp" + +#endif + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_1"].first += -time + Teuchos::Time::wallTime()); + } else { + INCL_EXP(double time = Teuchos::Time::wallTime()); + if (verbose) { + std::ostringstream os; + os << *prefix << "! src_filled && srcCrsGraph != nullptr" << endl; + std::cerr << os.str (); + } + for (size_t i = 0; i < numSameIDs; ++i, ++myid) { + const GO gid = srcRowMap.getGlobalElement (myid); + global_inds_host_view_type row; + srcCrsGraph.getGlobalRowView (gid, row); + tgtCrsGraph.insertGlobalIndices (gid, row.extent(0), row.data()); + } + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_2"].first += -time + Teuchos::Time::wallTime()); + } + + // + // "Permute" part of "copy and permute." + // + auto permuteToLIDs_h = permuteToLIDs.view_host (); + auto permuteFromLIDs_h = permuteFromLIDs.view_host (); + + if (src_filled || srcCrsGraphPtr == nullptr) { + INCL_EXP(double time = Teuchos::Time::wallTime()); + for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { + const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); + const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); + size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (srcgid); + Kokkos::resize(row_copy,row_length); + size_t check_row_length = 0; + srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length); + tgtCrsGraph.insertGlobalIndices (mygid, row_length, row_copy.data()); + } + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_3"].first += -time + Teuchos::Time::wallTime()); + } else { + INCL_EXP(double time = Teuchos::Time::wallTime()); + for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { + const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); + const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); + global_inds_host_view_type row; + srcCrsGraph.getGlobalRowView (srcgid, row); + tgtCrsGraph.insertGlobalIndices (mygid, row.extent(0), row.data()); + } + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_4"].first += -time + Teuchos::Time::wallTime()); + } + + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G"].first += -capTime + Teuchos::Time::wallTime()); +} + diff --git a/packages/tpetra/core/src/inner.hpp b/packages/tpetra/core/src/inner.hpp new file mode 100644 index 000000000000..27ae52015daa --- /dev/null +++ b/packages/tpetra/core/src/inner.hpp @@ -0,0 +1,126 @@ +// const GO gid = srcRowMapLocal.getGlobalElement (sourceLID); +// size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid); +// Kokkos::resize(row_copy,row_length); +// size_t check_row_length = 0; +// srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); +// tgtCrsGraph.insertGlobalIndices (gid, row_length, row_copy.data()); + +// using RowInfoViewType = Kokkos::View; +// RowInfoViewType tgtRowInfoView("RowInfoView", numSameIDs); +// kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew1", +// range_type (0, numSameIDs_as_LID), +// KOKKOS_LAMBDA(const LO sourceLID) { +// tgtRowInfoView(sourceLID) = tgtGraphDevice.rowConst(sourceLID); +// } +// ); + + std::cout << "here 0" << std::endl; + +using local_inds_device_value_t = LocalOrdinal; +using row_ptrs_device_value_t = size_t; +typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; +typedef typename Node::execution_space exec_space; +typedef Kokkos::RangePolicy range_type; + +const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); +// typedef typename crs_graph_type::global_inds_device_view_type::non_const_value_type global_inds_device_value_t; +// typedef typename crs_graph_type::row_ptrs_device_view_type::non_const_value_type row_ptrs_device_value_t; +typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; +typedef typename Node::execution_space exec_space; +typedef Kokkos::RangePolicy range_type; +typedef typename Kokkos::GraphRowViewConst graph_row_view_const_type; + +const k_local_graph_device_type & srcGraphDevice = srcCrsGraph.getLocalGraphDevice(); +const k_local_graph_device_type & tgtGraphDevice = tgtCrsGraph.getLocalGraphDevice(); + std::cout << "here 1" << std::endl; + +using local_map_type = typename crs_graph_type::map_type::local_map_type; + +local_map_type srcRowMapLocal = srcCrsGraph.getRowMap()->getLocalMap(); + std::cout << "here 2" << std::endl; +local_map_type srcColMapLocal = srcCrsGraph.getColMap()->getLocalMap(); + std::cout << "here 3" << std::endl; + +local_map_type tgtRowMapLocal = tgtCrsGraph.getRowMap()->getLocalMap(); + std::cout << "here 4" << std::endl; + +local_map_type tgtColMapLocal = tgtCrsGraph.getColMap()->getLocalMap(); + std::cout << "here 5" << std::endl; + +auto tgtLocalRowPtrsDevice = tgtCrsGraph.getLocalRowPtrsDevice(); +auto tgtLocalColIndsDevice = tgtCrsGraph.getLocalIndicesDevice(); +auto tgtLocalColIndsDeviceNonConst = tgtCrsGraph.lclIndsUnpacked_wdv.getDeviceView(Access::ReadWrite); +//auto srcLocalRowPtrsHost = srcCrsGraph.getLocalRowPtrsHost(); +auto srcLocalRowPtrsDevice = srcCrsGraph.getLocalRowPtrsDevice(); +auto srcLocalColIndsDevice = srcCrsGraph.getLocalIndicesDevice(); + std::cout << "here 7" << std::endl; + +typedef typename Node::execution_space exec_space; +typedef Kokkos::RangePolicy range_type; +typename num_row_entries_type::non_const_type h_numRowEnt = tgtCrsGraph.k_numRowEntries_; + std::cout << "here 8" << std::endl; + +auto k_numRowEnt = Kokkos::create_mirror_view_and_copy (device_type (), h_numRowEnt); + + std::cout << "here 9" << std::endl; + +LO numSameIDs_as_LID = static_cast(numSameIDs); +const bool sorted = false; + +Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", + range_type (0, numSameIDs_as_LID), + KOKKOS_LAMBDA(const LO sourceLID) + { + auto tgtRowInfo = tgtGraphDevice.rowConst(sourceLID); + auto tgtNumEntries = tgtRowInfo.length; + + auto start = srcLocalRowPtrsDevice(sourceLID); + auto end = srcLocalRowPtrsDevice(sourceLID+1); + auto rowLength = (end - start); + + //KOKKOS_ASSERT(rowLength <= max_row_entries); + + auto tstart = tgtLocalRowPtrsDevice(sourceLID); + auto tend = tstart + tgtNumEntries; + auto tend1 = tgtLocalRowPtrsDevice(sourceLID + 1); + + const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; + const size_t num_new_indices = rowLength; + size_t num_inserted = 0; + + local_inds_device_value_t *tgtColInds = tgtLocalColIndsDeviceNonConst.data()+tstart; + + size_t hint=0; + for (LO j = 0; j < rowLength; j++) { + auto ci = srcLocalColIndsDevice(start + j); + GO gi = srcColMapLocal.getGlobalElement(ci); + const auto lclColInd = tgtColMapLocal.getLocalElement(gi); + + auto numInTgtRow = (tend - tstart); + + const size_t offset = + KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + lclColInd, hint, sorted); + + if (offset == numInTgtRow) { + if (num_inserted >= num_avail) { // not enough room + return Teuchos::OrdinalTraits::invalid(); + } + //Kokkos::atomic_store (&tgtRowVals[offset], newVals); + tgtColInds[tend] = lclColInd; + ++tend; + hint = offset + 1; + ++num_inserted; + } + k_numRowEnt(sourceLID) += num_inserted; + + } + return size_t(0); + }); + std::cout << "here 10" << std::endl; + +Kokkos::deep_copy(exec_space(), tgtCrsGraph.k_numRowEntries_, k_numRowEnt); + std::cout << "here 11" << std::endl; + +tgtCrsGraph.setLocallyModified(); + std::cout << "here 12" << std::endl; From c64ff95fdfff1e099b8ead713f2492b6c83feae9 Mon Sep 17 00:00:00 2001 From: srkenno Date: Thu, 12 Dec 2024 15:01:56 -0700 Subject: [PATCH 18/23] more kokkos: compiles but segfaults --- .../core/src/graphCopyAndPermuteNew.hpp | 2 +- packages/tpetra/core/src/inner.hpp | 32 +++++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp b/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp index e3542e38407f..5a1f0ffa6184 100644 --- a/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp +++ b/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp @@ -90,7 +90,7 @@ copyAndPermuteNew( // source graph is not a CrsGraph, we can't use view mode, // because RowGraph only provides copy mode access to the data. INCL_EXP(double time = Teuchos::Time::wallTime()); -#if 1 +#if 0 // std::cout << "here 3" << std::endl; for (size_t i = 0; i < numSameIDs; ++i, ++myid) { diff --git a/packages/tpetra/core/src/inner.hpp b/packages/tpetra/core/src/inner.hpp index 27ae52015daa..02f590603e4b 100644 --- a/packages/tpetra/core/src/inner.hpp +++ b/packages/tpetra/core/src/inner.hpp @@ -16,13 +16,15 @@ std::cout << "here 0" << std::endl; -using local_inds_device_value_t = LocalOrdinal; +using global_inds_device_value_t = GlobalOrdinal; using row_ptrs_device_value_t = size_t; typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; typedef typename Node::execution_space exec_space; typedef Kokkos::RangePolicy range_type; const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); +const GlobalOrdinal GINV = Teuchos::OrdinalTraits::invalid (); + // typedef typename crs_graph_type::global_inds_device_view_type::non_const_value_type global_inds_device_value_t; // typedef typename crs_graph_type::row_ptrs_device_view_type::non_const_value_type row_ptrs_device_value_t; typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; @@ -43,13 +45,14 @@ local_map_type srcColMapLocal = srcCrsGraph.getColMap()->getLocalMap(); local_map_type tgtRowMapLocal = tgtCrsGraph.getRowMap()->getLocalMap(); std::cout << "here 4" << std::endl; - -local_map_type tgtColMapLocal = tgtCrsGraph.getColMap()->getLocalMap(); - std::cout << "here 5" << std::endl; +// std::cout << "here 4.1 " << tgtCrsGraph.getColMap() << std::endl; +// local_map_type tgtColMapLocal = tgtCrsGraph.getColMap()->getLocalMap(); +// std::cout << "here 5" << std::endl; auto tgtLocalRowPtrsDevice = tgtCrsGraph.getLocalRowPtrsDevice(); auto tgtLocalColIndsDevice = tgtCrsGraph.getLocalIndicesDevice(); auto tgtLocalColIndsDeviceNonConst = tgtCrsGraph.lclIndsUnpacked_wdv.getDeviceView(Access::ReadWrite); +auto tgtGlobalColInds = tgtCrsGraph.gblInds_wdv.getDeviceView(Access::ReadWrite); //auto srcLocalRowPtrsHost = srcCrsGraph.getLocalRowPtrsHost(); auto srcLocalRowPtrsDevice = srcCrsGraph.getLocalRowPtrsDevice(); auto srcLocalColIndsDevice = srcCrsGraph.getLocalIndicesDevice(); @@ -58,7 +61,7 @@ auto srcLocalColIndsDevice = srcCrsGraph.getLocalIndicesDevice(); typedef typename Node::execution_space exec_space; typedef Kokkos::RangePolicy range_type; typename num_row_entries_type::non_const_type h_numRowEnt = tgtCrsGraph.k_numRowEntries_; - std::cout << "here 8" << std::endl; +std::cout << "here 8: " << h_numRowEnt.extent(0) << " numSameIDs= " << numSameIDs << std::endl; auto k_numRowEnt = Kokkos::create_mirror_view_and_copy (device_type (), h_numRowEnt); @@ -71,7 +74,9 @@ Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", range_type (0, numSameIDs_as_LID), KOKKOS_LAMBDA(const LO sourceLID) { - auto tgtRowInfo = tgtGraphDevice.rowConst(sourceLID); + auto srcGid = srcRowMapLocal.getGlobalElement(sourceLID); + auto tgtLocalRow = tgtRowMapLocal.getLocalElement(srcGid); + auto tgtRowInfo = tgtGraphDevice.rowConst(tgtLocalRow); auto tgtNumEntries = tgtRowInfo.length; auto start = srcLocalRowPtrsDevice(sourceLID); @@ -80,39 +85,40 @@ Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", //KOKKOS_ASSERT(rowLength <= max_row_entries); - auto tstart = tgtLocalRowPtrsDevice(sourceLID); + auto tstart = tgtLocalRowPtrsDevice(tgtLocalRow); auto tend = tstart + tgtNumEntries; - auto tend1 = tgtLocalRowPtrsDevice(sourceLID + 1); + auto tend1 = tgtLocalRowPtrsDevice(tgtLocalRow + 1); const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; const size_t num_new_indices = rowLength; size_t num_inserted = 0; - local_inds_device_value_t *tgtColInds = tgtLocalColIndsDeviceNonConst.data()+tstart; + //local_inds_device_value_t *tgtColInds = tgtLocalColIndsDeviceNonConst.data()+tstart; + global_inds_device_value_t *tgtColInds = tgtGlobalColInds.data()+tstart; size_t hint=0; for (LO j = 0; j < rowLength; j++) { auto ci = srcLocalColIndsDevice(start + j); GO gi = srcColMapLocal.getGlobalElement(ci); - const auto lclColInd = tgtColMapLocal.getLocalElement(gi); + //const auto lclColInd = tgtColMapLocal.getLocalElement(gi); auto numInTgtRow = (tend - tstart); const size_t offset = KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, - lclColInd, hint, sorted); + gi, hint, sorted); if (offset == numInTgtRow) { if (num_inserted >= num_avail) { // not enough room return Teuchos::OrdinalTraits::invalid(); } //Kokkos::atomic_store (&tgtRowVals[offset], newVals); - tgtColInds[tend] = lclColInd; + tgtColInds[tend] = gi; ++tend; hint = offset + 1; ++num_inserted; } - k_numRowEnt(sourceLID) += num_inserted; + k_numRowEnt(tgtLocalRow) += num_inserted; } return size_t(0); From 8387e30060929624525e1c60fd763c41dd233497 Mon Sep 17 00:00:00 2001 From: srkenno Date: Fri, 13 Dec 2024 10:35:03 -0700 Subject: [PATCH 19/23] no longer segfaulting --- packages/tpetra/core/src/inner.hpp | 90 ++++++++++++++---------------- 1 file changed, 43 insertions(+), 47 deletions(-) diff --git a/packages/tpetra/core/src/inner.hpp b/packages/tpetra/core/src/inner.hpp index 02f590603e4b..654a326482e1 100644 --- a/packages/tpetra/core/src/inner.hpp +++ b/packages/tpetra/core/src/inner.hpp @@ -5,17 +5,6 @@ // srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); // tgtCrsGraph.insertGlobalIndices (gid, row_length, row_copy.data()); -// using RowInfoViewType = Kokkos::View; -// RowInfoViewType tgtRowInfoView("RowInfoView", numSameIDs); -// kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew1", -// range_type (0, numSameIDs_as_LID), -// KOKKOS_LAMBDA(const LO sourceLID) { -// tgtRowInfoView(sourceLID) = tgtGraphDevice.rowConst(sourceLID); -// } -// ); - - std::cout << "here 0" << std::endl; - using global_inds_device_value_t = GlobalOrdinal; using row_ptrs_device_value_t = size_t; typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; @@ -25,8 +14,6 @@ typedef Kokkos::RangePolicy range_type; const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); const GlobalOrdinal GINV = Teuchos::OrdinalTraits::invalid (); -// typedef typename crs_graph_type::global_inds_device_view_type::non_const_value_type global_inds_device_value_t; -// typedef typename crs_graph_type::row_ptrs_device_view_type::non_const_value_type row_ptrs_device_value_t; typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; typedef typename Node::execution_space exec_space; typedef Kokkos::RangePolicy range_type; @@ -34,98 +21,107 @@ typedef typename Kokkos::GraphRowViewConst graph_row_vi const k_local_graph_device_type & srcGraphDevice = srcCrsGraph.getLocalGraphDevice(); const k_local_graph_device_type & tgtGraphDevice = tgtCrsGraph.getLocalGraphDevice(); - std::cout << "here 1" << std::endl; using local_map_type = typename crs_graph_type::map_type::local_map_type; - local_map_type srcRowMapLocal = srcCrsGraph.getRowMap()->getLocalMap(); - std::cout << "here 2" << std::endl; local_map_type srcColMapLocal = srcCrsGraph.getColMap()->getLocalMap(); - std::cout << "here 3" << std::endl; - local_map_type tgtRowMapLocal = tgtCrsGraph.getRowMap()->getLocalMap(); - std::cout << "here 4" << std::endl; -// std::cout << "here 4.1 " << tgtCrsGraph.getColMap() << std::endl; -// local_map_type tgtColMapLocal = tgtCrsGraph.getColMap()->getLocalMap(); -// std::cout << "here 5" << std::endl; - -auto tgtLocalRowPtrsDevice = tgtCrsGraph.getLocalRowPtrsDevice(); -auto tgtLocalColIndsDevice = tgtCrsGraph.getLocalIndicesDevice(); -auto tgtLocalColIndsDeviceNonConst = tgtCrsGraph.lclIndsUnpacked_wdv.getDeviceView(Access::ReadWrite); + +auto tgtLocalRowPtrsDevice = tgtCrsGraph.getRowPtrsUnpackedDevice(); auto tgtGlobalColInds = tgtCrsGraph.gblInds_wdv.getDeviceView(Access::ReadWrite); -//auto srcLocalRowPtrsHost = srcCrsGraph.getLocalRowPtrsHost(); auto srcLocalRowPtrsDevice = srcCrsGraph.getLocalRowPtrsDevice(); -auto srcLocalColIndsDevice = srcCrsGraph.getLocalIndicesDevice(); - std::cout << "here 7" << std::endl; +auto srcLocalColIndsDevice = srcCrsGraph.lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly); typedef typename Node::execution_space exec_space; typedef Kokkos::RangePolicy range_type; typename num_row_entries_type::non_const_type h_numRowEnt = tgtCrsGraph.k_numRowEntries_; -std::cout << "here 8: " << h_numRowEnt.extent(0) << " numSameIDs= " << numSameIDs << std::endl; auto k_numRowEnt = Kokkos::create_mirror_view_and_copy (device_type (), h_numRowEnt); - - std::cout << "here 9" << std::endl; - LO numSameIDs_as_LID = static_cast(numSameIDs); const bool sorted = false; +#define CHECK(a,i) do { \ + if (i >= a.extent(0) || i < 0) { \ + char buf[100]; \ + sprintf(buf,"ERROR: i= %d a= %s e= %d", i, #a, a.extent(0)); \ + Kokkos::abort(buf); \ + } } while(0) + +#define AB(lin) do { \ + char buf[100]; \ + sprintf(buf,"ERROR: line= %d", lin); \ + Kokkos::abort(buf); \ + } while(0) + Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", range_type (0, numSameIDs_as_LID), KOKKOS_LAMBDA(const LO sourceLID) { auto srcGid = srcRowMapLocal.getGlobalElement(sourceLID); auto tgtLocalRow = tgtRowMapLocal.getLocalElement(srcGid); - auto tgtRowInfo = tgtGraphDevice.rowConst(tgtLocalRow); - auto tgtNumEntries = tgtRowInfo.length; - + if (tgtLocalRow == LINV) { + AB(__LINE__); + } + CHECK(k_numRowEnt, tgtLocalRow); + auto tgtNumEntries = k_numRowEnt(tgtLocalRow); + + // FIXME no auto use + CHECK(srcLocalRowPtrsDevice, sourceLID); auto start = srcLocalRowPtrsDevice(sourceLID); + CHECK(srcLocalRowPtrsDevice, sourceLID+1); auto end = srcLocalRowPtrsDevice(sourceLID+1); auto rowLength = (end - start); //KOKKOS_ASSERT(rowLength <= max_row_entries); + CHECK(tgtLocalRowPtrsDevice, tgtLocalRow); auto tstart = tgtLocalRowPtrsDevice(tgtLocalRow); auto tend = tstart + tgtNumEntries; + CHECK(tgtLocalRowPtrsDevice, tgtLocalRow + 1); auto tend1 = tgtLocalRowPtrsDevice(tgtLocalRow + 1); const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; const size_t num_new_indices = rowLength; size_t num_inserted = 0; - //local_inds_device_value_t *tgtColInds = tgtLocalColIndsDeviceNonConst.data()+tstart; - global_inds_device_value_t *tgtColInds = tgtGlobalColInds.data()+tstart; + CHECK(tgtGlobalColInds, tstart); + //global_inds_device_value_t *tgtColInds = tgtGlobalColInds.data()+tstart; + global_inds_device_value_t *tgtGlobalColIndsPtr = tgtGlobalColInds.data(); size_t hint=0; for (LO j = 0; j < rowLength; j++) { auto ci = srcLocalColIndsDevice(start + j); GO gi = srcColMapLocal.getGlobalElement(ci); - //const auto lclColInd = tgtColMapLocal.getLocalElement(gi); - auto numInTgtRow = (tend - tstart); const size_t offset = - KokkosSparse::findRelOffset (tgtColInds, numInTgtRow, + KokkosSparse::findRelOffset (tgtGlobalColIndsPtr+tstart, + numInTgtRow, gi, hint, sorted); if (offset == numInTgtRow) { if (num_inserted >= num_avail) { // not enough room - return Teuchos::OrdinalTraits::invalid(); + //return Teuchos::OrdinalTraits::invalid(); + Kokkos::abort("num_avail"); } //Kokkos::atomic_store (&tgtRowVals[offset], newVals); - tgtColInds[tend] = gi; + tgtGlobalColIndsPtr[tstart + offset] = gi; ++tend; hint = offset + 1; ++num_inserted; } - k_numRowEnt(tgtLocalRow) += num_inserted; + } + k_numRowEnt(tgtLocalRow) += num_inserted; - } return size_t(0); }); - std::cout << "here 10" << std::endl; -Kokkos::deep_copy(exec_space(), tgtCrsGraph.k_numRowEntries_, k_numRowEnt); +std::cout << "here 10 host size= " << tgtCrsGraph.k_numRowEntries_.extent(0) << " dev: " << k_numRowEnt.extent(0) << std::endl; + +Kokkos::fence("here 10"); + std::cout << "here 10.1" << std::endl; + +Kokkos::deep_copy(tgtCrsGraph.k_numRowEntries_, k_numRowEnt); std::cout << "here 11" << std::endl; tgtCrsGraph.setLocallyModified(); From 0d52dece2352fd0c523e21f1e4e81b466b2657fd Mon Sep 17 00:00:00 2001 From: srkenno Date: Fri, 13 Dec 2024 11:09:00 -0700 Subject: [PATCH 20/23] CrsGraph;;copyAndPermute is now running partially on device, nice speedup --- .../panzer/mini-em/example/BlockPrec/main.cpp | 2 +- packages/tpetra/core/src/inner.hpp | 39 +++++++++++-------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/packages/panzer/mini-em/example/BlockPrec/main.cpp b/packages/panzer/mini-em/example/BlockPrec/main.cpp index 78897f9db6fb..c3e1a016214e 100644 --- a/packages/panzer/mini-em/example/BlockPrec/main.cpp +++ b/packages/panzer/mini-em/example/BlockPrec/main.cpp @@ -969,7 +969,7 @@ int main(int argc,char * argv[]){ } if (!comm->getRank()) { for (auto& t : Timers) { - std::cout << "[TIMER] repeat= " << repeat << " " << t.first << " = " << t.second.first << std::endl; + //std::cout << "[TIMER] repeat= " << repeat << " " << t.first << " = " << t.second.first << std::endl; if (t.second.second.size() != numRepeatRuns) t.second.second.resize(numRepeatRuns); t.second.second[repeat] = t.second.first; } diff --git a/packages/tpetra/core/src/inner.hpp b/packages/tpetra/core/src/inner.hpp index 654a326482e1..808bdf3ac132 100644 --- a/packages/tpetra/core/src/inner.hpp +++ b/packages/tpetra/core/src/inner.hpp @@ -40,14 +40,26 @@ auto k_numRowEnt = Kokkos::create_mirror_view_and_copy (device_type (), h_numRow LO numSameIDs_as_LID = static_cast(numSameIDs); const bool sorted = false; +#ifdef PANZER_DO_CHECK_INNER_HPP +#undef PANZER_DO_CHECK_INNER_HPP +#endif +#define PANZER_DO_CHECK_INNER_HPP 1 +#if PANZER_DO_CHECK_INNER_HPP #define CHECK(a,i) do { \ - if (i >= a.extent(0) || i < 0) { \ + if ((int)(i) >= (int)a.extent(0)) { \ char buf[100]; \ - sprintf(buf,"ERROR: i= %d a= %s e= %d", i, #a, a.extent(0)); \ + sprintf(buf,"ERROR: i= %d a= %s e= %d", (int)(i), #a, (int)a.extent(0)); \ Kokkos::abort(buf); \ } } while(0) +#else +#define CHECK(a,i) do { } while(0) +#endif -#define AB(lin) do { \ +#ifdef PANZER_INNER_ABORT +#undef PANZER_INNER_ABORT +#endif + +#define PANZER_INNER_ABORT(lin) do { \ char buf[100]; \ sprintf(buf,"ERROR: line= %d", lin); \ Kokkos::abort(buf); \ @@ -58,10 +70,10 @@ Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", KOKKOS_LAMBDA(const LO sourceLID) { auto srcGid = srcRowMapLocal.getGlobalElement(sourceLID); + if (srcGid == GINV) PANZER_INNER_ABORT(__LINE__); auto tgtLocalRow = tgtRowMapLocal.getLocalElement(srcGid); - if (tgtLocalRow == LINV) { - AB(__LINE__); - } + if (tgtLocalRow == LINV) PANZER_INNER_ABORT(__LINE__); + CHECK(k_numRowEnt, tgtLocalRow); auto tgtNumEntries = k_numRowEnt(tgtLocalRow); @@ -81,17 +93,17 @@ Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", auto tend1 = tgtLocalRowPtrsDevice(tgtLocalRow + 1); const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; - const size_t num_new_indices = rowLength; size_t num_inserted = 0; CHECK(tgtGlobalColInds, tstart); - //global_inds_device_value_t *tgtColInds = tgtGlobalColInds.data()+tstart; global_inds_device_value_t *tgtGlobalColIndsPtr = tgtGlobalColInds.data(); size_t hint=0; - for (LO j = 0; j < rowLength; j++) { + for (size_t j = 0; j < rowLength; j++) { + CHECK(srcLocalColIndsDevice, start + j); auto ci = srcLocalColIndsDevice(start + j); GO gi = srcColMapLocal.getGlobalElement(ci); + if (gi == GINV) PANZER_INNER_ABORT(__LINE__); auto numInTgtRow = (tend - tstart); const size_t offset = @@ -105,24 +117,19 @@ Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", Kokkos::abort("num_avail"); } //Kokkos::atomic_store (&tgtRowVals[offset], newVals); + CHECK(tgtGlobalColInds, tstart + offset); tgtGlobalColIndsPtr[tstart + offset] = gi; ++tend; hint = offset + 1; ++num_inserted; } } + CHECK(k_numRowEnt, tgtLocalRow); k_numRowEnt(tgtLocalRow) += num_inserted; return size_t(0); }); -std::cout << "here 10 host size= " << tgtCrsGraph.k_numRowEntries_.extent(0) << " dev: " << k_numRowEnt.extent(0) << std::endl; - Kokkos::fence("here 10"); - std::cout << "here 10.1" << std::endl; - Kokkos::deep_copy(tgtCrsGraph.k_numRowEntries_, k_numRowEnt); - std::cout << "here 11" << std::endl; - tgtCrsGraph.setLocallyModified(); - std::cout << "here 12" << std::endl; From 393684e37b4c6d2258225540434cce4ab239ac25 Mon Sep 17 00:00:00 2001 From: srkenno Date: Mon, 16 Dec 2024 13:03:01 -0700 Subject: [PATCH 21/23] cleanup; implement permute phase for isFillComplete --- .../tpetra/core/src/Tpetra_CrsGraph_decl.hpp | 9 + .../core/src/graphCopyAndPermuteNew.hpp | 183 ++++++++++++++++-- 2 files changed, 175 insertions(+), 17 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp index 6a089ac93b14..01c1f3ac6d2b 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp @@ -1194,6 +1194,15 @@ namespace Tpetra { buffer_device_type>& permuteFromLIDs, const CombineMode CM) override; + + void + insertGlobalIndicesDevice + (const CrsGraph& srcCrsGraph, + CrsGraph& tgtCrsGraph, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + LocalOrdinal loopEnd); + void copyAndPermuteNew (const row_graph_type& source, diff --git a/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp b/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp index 5a1f0ffa6184..23784f19cab3 100644 --- a/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp +++ b/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp @@ -1,8 +1,152 @@ template void CrsGraph:: -copyAndPermuteNew( - const row_graph_type& srcRowGraph, +insertGlobalIndicesDevice(const CrsGraph& srcCrsGraph, + CrsGraph& tgtCrsGraph, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + LocalOrdinal loopEnd) +{ + using crs_graph_type = CrsGraph; + using LO = LocalOrdinal; + using GO = GlobalOrdinal; + typedef typename crs_graph_type::global_inds_device_view_type::non_const_value_type global_inds_device_value_t; + typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; + typedef typename Node::execution_space exec_space; + typedef Kokkos::RangePolicy range_type; + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + const GlobalOrdinal GINV = Teuchos::OrdinalTraits::invalid (); + + const k_local_graph_device_type & srcGraphDevice = srcCrsGraph.getLocalGraphDevice(); + const k_local_graph_device_type & tgtGraphDevice = tgtCrsGraph.getLocalGraphDevice(); + + using local_map_type = typename crs_graph_type::map_type::local_map_type; + local_map_type srcRowMapLocal = srcCrsGraph.getRowMap()->getLocalMap(); + local_map_type srcColMapLocal = srcCrsGraph.getColMap()->getLocalMap(); + local_map_type tgtRowMapLocal = tgtCrsGraph.getRowMap()->getLocalMap(); + + auto tgtLocalRowPtrsDevice = tgtCrsGraph.getRowPtrsUnpackedDevice(); + auto tgtGlobalColInds = tgtCrsGraph.gblInds_wdv.getDeviceView(Access::ReadWrite); + auto srcLocalRowPtrsDevice = srcCrsGraph.getLocalRowPtrsDevice(); + auto srcLocalColIndsDevice = srcCrsGraph.lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly); + + typename crs_graph_type::num_row_entries_type::non_const_type h_numRowEnt = tgtCrsGraph.k_numRowEntries_; + + auto k_numRowEnt = Kokkos::create_mirror_view_and_copy (device_type (), h_numRowEnt); + + const bool sorted = false; + + bool hasMap = permuteFromLIDs.extent(0) > 0; + auto permuteToLIDs_d = permuteToLIDs.view_device (); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); + +#ifdef PANZER_DO_CHECK_INNER_HPP +#undef PANZER_DO_CHECK_INNER_HPP +#endif +#define PANZER_DO_CHECK_INNER_HPP 0 +#if PANZER_DO_CHECK_INNER_HPP +#define CHECK(a,i) do { \ + if ((int)(i) >= (int)a.extent(0)) { \ + printf("ERROR: i= %d a= %s e= %d", (int)(i), #a, (int)a.extent(0)); \ + Kokkos::abort("inding error"); \ + } } while(0) +#else +#define CHECK(a,i) do { } while(0) +#endif + +#ifdef PANZER_INNER_ABORT +#undef PANZER_INNER_ABORT +#endif + +#define PANZER_INNER_ABORT(lin) do { \ + printf("ERROR: line= %d", lin); \ + Kokkos::abort("error"); \ + } while(0) + + Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", + range_type (0, loopEnd), + KOKKOS_LAMBDA(const LO sourceLID) + { + auto srcLid = sourceLID; + auto tgtLid = sourceLID; + if (hasMap) { + srcLid = permuteFromLIDs_d(srcLid); + tgtLid = permuteToLIDs_d(tgtLid); + } + auto srcGid = srcRowMapLocal.getGlobalElement(srcLid); + if (srcGid == GINV) PANZER_INNER_ABORT(__LINE__); + auto tgtGid = tgtRowMapLocal.getGlobalElement(tgtLid); + + auto tgtLocalRow = tgtRowMapLocal.getLocalElement(tgtGid); + if (tgtLocalRow == LINV) PANZER_INNER_ABORT(__LINE__); + if (tgtLocalRow != tgtLid) PANZER_INNER_ABORT(__LINE__); + CHECK(k_numRowEnt, tgtLocalRow); + auto tgtNumEntries = k_numRowEnt(tgtLocalRow); + + // FIXME no auto use + CHECK(srcLocalRowPtrsDevice, srcLid); + auto start = srcLocalRowPtrsDevice(srcLid); + CHECK(srcLocalRowPtrsDevice, srcLid + 1); + auto end = srcLocalRowPtrsDevice(srcLid + 1); + auto rowLength = (end - start); + + //KOKKOS_ASSERT(rowLength <= max_row_entries); + + CHECK(tgtLocalRowPtrsDevice, tgtLocalRow); + auto tstart = tgtLocalRowPtrsDevice(tgtLocalRow); + auto tend = tstart + tgtNumEntries; + CHECK(tgtLocalRowPtrsDevice, tgtLocalRow + 1); + auto tend1 = tgtLocalRowPtrsDevice(tgtLocalRow + 1); + + const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; + size_t num_inserted = 0; + + CHECK(tgtGlobalColInds, tstart); + global_inds_device_value_t *tgtGlobalColIndsPtr = tgtGlobalColInds.data(); + + size_t hint=0; + for (size_t j = 0; j < rowLength; j++) { + CHECK(srcLocalColIndsDevice, start + j); + auto ci = srcLocalColIndsDevice(start + j); + GO gi = srcColMapLocal.getGlobalElement(ci); + if (gi == GINV) PANZER_INNER_ABORT(__LINE__); + auto numInTgtRow = (tend - tstart); + + const size_t offset = + KokkosSparse::findRelOffset (tgtGlobalColIndsPtr+tstart, + numInTgtRow, + gi, hint, sorted); + + if (offset == numInTgtRow) { + if (num_inserted >= num_avail) { // not enough room + //return Teuchos::OrdinalTraits::invalid(); + Kokkos::abort("num_avail"); + } + //Kokkos::atomic_store (&tgtRowVals[offset], newVals); + CHECK(tgtGlobalColInds, tstart + offset); + tgtGlobalColIndsPtr[tstart + offset] = gi; + ++tend; + hint = offset + 1; + ++num_inserted; + } + } + CHECK(k_numRowEnt, tgtLocalRow); + k_numRowEnt(tgtLocalRow) += num_inserted; + + return size_t(0); + }); + + Kokkos::fence("here 10"); + Kokkos::deep_copy(tgtCrsGraph.k_numRowEntries_, k_numRowEnt); + tgtCrsGraph.setLocallyModified(); +} + + +template +void +CrsGraph:: +copyAndPermuteNew(const row_graph_type& srcRowGraph, row_graph_type& tgtRowGraph, const size_t numSameIDs, const Kokkos::DualView; const crs_graph_type *srcCrsGraphPtr = dynamic_cast(&srcRowGraph); - if (!srcCrsGraphPtr) { - std::cout << "srk error srcGraph type= " << typeid(srcRowGraph).name() << std::endl; - std::terminate(); - } + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!srcCrsGraphPtr, std::runtime_error, + "error srcGraph type= " << typeid(srcRowGraph).name()); const crs_graph_type& srcCrsGraph = *srcCrsGraphPtr; crs_graph_type *tgtCrsGraphPtr = dynamic_cast(&tgtRowGraph); - if (!tgtCrsGraphPtr) { - std::cout << "srk error tgtGraph type= " << typeid(tgtRowGraph).name() << std::endl; - std::terminate(); - } + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!srcCrsGraphPtr, std::runtime_error, + "error tgtGraph type= " << typeid(tgtRowGraph).name()); + crs_graph_type& tgtCrsGraph = *tgtCrsGraphPtr; - // std::cout << "here 1" << std::endl; INCL_EXP(double padTime = Teuchos::Time::wallTime()); auto padding = tgtCrsGraph.computeCrsPadding(srcRowGraph, numSameIDs, @@ -73,11 +211,12 @@ copyAndPermuteNew( nonconst_global_inds_host_view_type row_copy; LO myid = 0; - // std::cout << "here 2" << std::endl; - // // "Copy" part of "copy and permute." // + LO numSameIDs_as_LID = static_cast(numSameIDs); + using LidMapType = std::function ; + if (src_filled || srcCrsGraphPtr == nullptr) { if (verbose) { std::ostringstream os; @@ -105,8 +244,10 @@ copyAndPermuteNew( #else - -#include "inner.hpp" + Kokkos::DualView noPermute; + insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, + noPermute, noPermute, + numSameIDs_as_LID); #endif INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_1"].first += -time + Teuchos::Time::wallTime()); @@ -131,9 +272,12 @@ copyAndPermuteNew( // auto permuteToLIDs_h = permuteToLIDs.view_host (); auto permuteFromLIDs_h = permuteFromLIDs.view_host (); + auto permuteToLIDs_d = permuteToLIDs.view_device (); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); if (src_filled || srcCrsGraphPtr == nullptr) { INCL_EXP(double time = Teuchos::Time::wallTime()); +#if 0 for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); @@ -143,6 +287,11 @@ copyAndPermuteNew( srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length); tgtCrsGraph.insertGlobalIndices (mygid, row_length, row_copy.data()); } +#else + insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, + permuteToLIDs, permuteFromLIDs, // note reversed arg order, tgt, then src + static_cast (permuteToLIDs_h.extent (0))); +#endif INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_3"].first += -time + Teuchos::Time::wallTime()); } else { INCL_EXP(double time = Teuchos::Time::wallTime()); From 067a50bb226681db9c95409806d0b77cd8811827 Mon Sep 17 00:00:00 2001 From: srkenno Date: Mon, 16 Dec 2024 15:04:33 -0700 Subject: [PATCH 22/23] paste temp source files into CrsGraph; remove temp source files --- .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 317 +++++++++++++++++- .../core/src/graphCopyAndPermuteNew.hpp | 315 ----------------- packages/tpetra/core/src/inner.hpp | 135 -------- 3 files changed, 316 insertions(+), 451 deletions(-) delete mode 100644 packages/tpetra/core/src/graphCopyAndPermuteNew.hpp delete mode 100644 packages/tpetra/core/src/inner.hpp diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 820aaf456dc8..c11dae4dfb6e 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -7669,7 +7669,322 @@ namespace Tpetra { return output; } -#include "graphCopyAndPermuteNew.hpp" + template + void + CrsGraph:: + insertGlobalIndicesDevice(const CrsGraph& srcCrsGraph, + CrsGraph& tgtCrsGraph, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + LocalOrdinal loopEnd) + { + using crs_graph_type = CrsGraph; + using LO = LocalOrdinal; + using GO = GlobalOrdinal; + typedef typename crs_graph_type::global_inds_device_view_type::non_const_value_type global_inds_device_value_t; + typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; + typedef typename Node::execution_space exec_space; + typedef Kokkos::RangePolicy range_type; + + const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); + const GlobalOrdinal GINV = Teuchos::OrdinalTraits::invalid (); + + const k_local_graph_device_type & srcGraphDevice = srcCrsGraph.getLocalGraphDevice(); + const k_local_graph_device_type & tgtGraphDevice = tgtCrsGraph.getLocalGraphDevice(); + + using local_map_type = typename crs_graph_type::map_type::local_map_type; + local_map_type srcRowMapLocal = srcCrsGraph.getRowMap()->getLocalMap(); + local_map_type srcColMapLocal = srcCrsGraph.getColMap()->getLocalMap(); + local_map_type tgtRowMapLocal = tgtCrsGraph.getRowMap()->getLocalMap(); + + auto tgtLocalRowPtrsDevice = tgtCrsGraph.getRowPtrsUnpackedDevice(); + auto tgtGlobalColInds = tgtCrsGraph.gblInds_wdv.getDeviceView(Access::ReadWrite); + auto srcLocalRowPtrsDevice = srcCrsGraph.getLocalRowPtrsDevice(); + auto srcLocalColIndsDevice = srcCrsGraph.lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly); + + typename crs_graph_type::num_row_entries_type::non_const_type h_numRowEnt = tgtCrsGraph.k_numRowEntries_; + + auto k_numRowEnt = Kokkos::create_mirror_view_and_copy (device_type (), h_numRowEnt); + + const bool sorted = false; + + bool hasMap = permuteFromLIDs.extent(0) > 0; + auto permuteToLIDs_d = permuteToLIDs.view_device (); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); + +#ifdef PANZER_DO_CHECK_INNER_HPP +#undef PANZER_DO_CHECK_INNER_HPP +#endif +#define PANZER_DO_CHECK_INNER_HPP 0 +#if PANZER_DO_CHECK_INNER_HPP +#define CHECK(a,i) do { \ + if ((int)(i) >= (int)a.extent(0)) { \ + printf("ERROR: i= %d a= %s e= %d", (int)(i), #a, (int)a.extent(0)); \ + Kokkos::abort("inding error"); \ + } } while(0) +#else +#define CHECK(a,i) do { } while(0) +#endif + +#ifdef PANZER_INNER_ABORT +#undef PANZER_INNER_ABORT +#endif + +#define PANZER_INNER_ABORT(lin) do { \ + printf("ERROR: line= %d", lin); \ + Kokkos::abort("error"); \ + } while(0) + + Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", + range_type (0, loopEnd), + KOKKOS_LAMBDA(const LO sourceLID) + { + auto srcLid = sourceLID; + auto tgtLid = sourceLID; + if (hasMap) { + srcLid = permuteFromLIDs_d(srcLid); + tgtLid = permuteToLIDs_d(tgtLid); + } + auto srcGid = srcRowMapLocal.getGlobalElement(srcLid); + if (srcGid == GINV) PANZER_INNER_ABORT(__LINE__); + auto tgtGid = tgtRowMapLocal.getGlobalElement(tgtLid); + + auto tgtLocalRow = tgtRowMapLocal.getLocalElement(tgtGid); + if (tgtLocalRow == LINV) PANZER_INNER_ABORT(__LINE__); + if (tgtLocalRow != tgtLid) PANZER_INNER_ABORT(__LINE__); + CHECK(k_numRowEnt, tgtLocalRow); + auto tgtNumEntries = k_numRowEnt(tgtLocalRow); + + // FIXME no auto use + CHECK(srcLocalRowPtrsDevice, srcLid); + auto start = srcLocalRowPtrsDevice(srcLid); + CHECK(srcLocalRowPtrsDevice, srcLid + 1); + auto end = srcLocalRowPtrsDevice(srcLid + 1); + auto rowLength = (end - start); + + //KOKKOS_ASSERT(rowLength <= max_row_entries); + + CHECK(tgtLocalRowPtrsDevice, tgtLocalRow); + auto tstart = tgtLocalRowPtrsDevice(tgtLocalRow); + auto tend = tstart + tgtNumEntries; + CHECK(tgtLocalRowPtrsDevice, tgtLocalRow + 1); + auto tend1 = tgtLocalRowPtrsDevice(tgtLocalRow + 1); + + const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; + size_t num_inserted = 0; + + CHECK(tgtGlobalColInds, tstart); + global_inds_device_value_t *tgtGlobalColIndsPtr = tgtGlobalColInds.data(); + + size_t hint=0; + for (size_t j = 0; j < rowLength; j++) { + CHECK(srcLocalColIndsDevice, start + j); + auto ci = srcLocalColIndsDevice(start + j); + GO gi = srcColMapLocal.getGlobalElement(ci); + if (gi == GINV) PANZER_INNER_ABORT(__LINE__); + auto numInTgtRow = (tend - tstart); + + const size_t offset = + KokkosSparse::findRelOffset (tgtGlobalColIndsPtr+tstart, + numInTgtRow, + gi, hint, sorted); + + if (offset == numInTgtRow) { + if (num_inserted >= num_avail) { // not enough room + //return Teuchos::OrdinalTraits::invalid(); + Kokkos::abort("num_avail"); + } + //Kokkos::atomic_store (&tgtRowVals[offset], newVals); + CHECK(tgtGlobalColInds, tstart + offset); + tgtGlobalColIndsPtr[tstart + offset] = gi; + ++tend; + hint = offset + 1; + ++num_inserted; + } + } + CHECK(k_numRowEnt, tgtLocalRow); + k_numRowEnt(tgtLocalRow) += num_inserted; + + return size_t(0); + }); + + Kokkos::fence("here 10"); + Kokkos::deep_copy(tgtCrsGraph.k_numRowEntries_, k_numRowEnt); + tgtCrsGraph.setLocallyModified(); + } + + + template + void + CrsGraph:: + copyAndPermuteNew(const row_graph_type& srcRowGraph, + row_graph_type& tgtRowGraph, + const size_t numSameIDs, + const Kokkos::DualView& permuteToLIDs, + const Kokkos::DualView& permuteFromLIDs, + const CombineMode CM) + { + using std::endl; + using LO = local_ordinal_type; + using GO = global_ordinal_type; + using this_CRS_type = CrsGraph; + const char tfecfFuncName[] = "copyAndPermuteNew: "; + const bool verbose = verbose_; + + Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermuteNew"); + INCL_EXP(double capTime = Teuchos::Time::wallTime()); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("CrsGraph", "copyAndPermuteNew"); + std::ostringstream os; + os << *prefix << endl; + std::cerr << os.str (); + } + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC + (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0), + std::runtime_error, "permuteToLIDs.extent(0) = " + << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = " + << permuteFromLIDs.extent (0) << "."); + + if (verbose) { + std::ostringstream os; + os << *prefix << "Compute padding" << endl; + std::cerr << os.str (); + } + + using crs_graph_type = CrsGraph; + const crs_graph_type *srcCrsGraphPtr = dynamic_cast(&srcRowGraph); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!srcCrsGraphPtr, std::runtime_error, + "error srcGraph type= " << typeid(srcRowGraph).name()); + const crs_graph_type& srcCrsGraph = *srcCrsGraphPtr; + + crs_graph_type *tgtCrsGraphPtr = dynamic_cast(&tgtRowGraph); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!srcCrsGraphPtr, std::runtime_error, + "error tgtGraph type= " << typeid(tgtRowGraph).name()); + + crs_graph_type& tgtCrsGraph = *tgtCrsGraphPtr; + + INCL_EXP(double padTime = Teuchos::Time::wallTime()); + auto padding = tgtCrsGraph.computeCrsPadding(srcRowGraph, numSameIDs, + permuteToLIDs, permuteFromLIDs, verbose); + + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_pad"].first += -padTime + Teuchos::Time::wallTime()); + INCL_EXP(double apadTime = Teuchos::Time::wallTime()); + tgtCrsGraph.applyCrsPadding(*padding, verbose); + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_apad"].first += -apadTime + Teuchos::Time::wallTime()); + + const map_type& srcRowMap = *(srcRowGraph.getRowMap()); + const map_type& tgtRowMap = *(tgtRowGraph.getRowMap()); + const bool src_filled = srcRowGraph.isFillComplete(); + nonconst_global_inds_host_view_type row_copy; + LO myid = 0; + + // + // "Copy" part of "copy and permute." + // + LO numSameIDs_as_LID = static_cast(numSameIDs); + using LidMapType = std::function ; + + if (src_filled || srcCrsGraphPtr == nullptr) { + if (verbose) { + std::ostringstream os; + os << *prefix << "src_filled || srcCrsGraph == nullptr" << endl; + std::cerr << os.str (); + } + // If the source graph is fill complete, we can't use view mode, + // because the data might be stored in a different format not + // compatible with the expectations of view mode. Also, if the + // source graph is not a CrsGraph, we can't use view mode, + // because RowGraph only provides copy mode access to the data. + INCL_EXP(double time = Teuchos::Time::wallTime()); +#if 0 + // std::cout << "here 3" << std::endl; + + for (size_t i = 0; i < numSameIDs; ++i, ++myid) { + const GO gid = srcRowMap.getGlobalElement (myid); + size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid); + Kokkos::resize(row_copy,row_length); + size_t check_row_length = 0; + srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); + tgtCrsGraph.insertGlobalIndices (gid, row_length, row_copy.data()); + } + // std::cout << "here 4" << std::endl; + +#else + + Kokkos::DualView noPermute; + insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, + noPermute, noPermute, + numSameIDs_as_LID); + +#endif + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_1"].first += -time + Teuchos::Time::wallTime()); + } else { + INCL_EXP(double time = Teuchos::Time::wallTime()); + if (verbose) { + std::ostringstream os; + os << *prefix << "! src_filled && srcCrsGraph != nullptr" << endl; + std::cerr << os.str (); + } + for (size_t i = 0; i < numSameIDs; ++i, ++myid) { + const GO gid = srcRowMap.getGlobalElement (myid); + global_inds_host_view_type row; + srcCrsGraph.getGlobalRowView (gid, row); + tgtCrsGraph.insertGlobalIndices (gid, row.extent(0), row.data()); + } + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_2"].first += -time + Teuchos::Time::wallTime()); + } + + // + // "Permute" part of "copy and permute." + // + auto permuteToLIDs_h = permuteToLIDs.view_host (); + auto permuteFromLIDs_h = permuteFromLIDs.view_host (); + auto permuteToLIDs_d = permuteToLIDs.view_device (); + auto permuteFromLIDs_d = permuteFromLIDs.view_device (); + + if (src_filled || srcCrsGraphPtr == nullptr) { + INCL_EXP(double time = Teuchos::Time::wallTime()); +#if 0 + for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { + const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); + const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); + size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (srcgid); + Kokkos::resize(row_copy,row_length); + size_t check_row_length = 0; + srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length); + tgtCrsGraph.insertGlobalIndices (mygid, row_length, row_copy.data()); + } +#else + insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, + permuteToLIDs, permuteFromLIDs, // note reversed arg order, tgt, then src + static_cast (permuteToLIDs_h.extent (0))); +#endif + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_3"].first += -time + Teuchos::Time::wallTime()); + } else { + INCL_EXP(double time = Teuchos::Time::wallTime()); + for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { + const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); + const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); + global_inds_host_view_type row; + srcCrsGraph.getGlobalRowView (srcgid, row); + tgtCrsGraph.insertGlobalIndices (mygid, row.extent(0), row.data()); + } + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_4"].first += -time + Teuchos::Time::wallTime()); + } + + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + INCL_EXP(if (IN_EVAL_J) Timers["capsg_G"].first += -capTime + Teuchos::Time::wallTime()); + } + + } // namespace Tpetra diff --git a/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp b/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp deleted file mode 100644 index 23784f19cab3..000000000000 --- a/packages/tpetra/core/src/graphCopyAndPermuteNew.hpp +++ /dev/null @@ -1,315 +0,0 @@ -template -void -CrsGraph:: -insertGlobalIndicesDevice(const CrsGraph& srcCrsGraph, - CrsGraph& tgtCrsGraph, - const Kokkos::DualView& permuteToLIDs, - const Kokkos::DualView& permuteFromLIDs, - LocalOrdinal loopEnd) -{ - using crs_graph_type = CrsGraph; - using LO = LocalOrdinal; - using GO = GlobalOrdinal; - typedef typename crs_graph_type::global_inds_device_view_type::non_const_value_type global_inds_device_value_t; - typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; - typedef typename Node::execution_space exec_space; - typedef Kokkos::RangePolicy range_type; - - const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); - const GlobalOrdinal GINV = Teuchos::OrdinalTraits::invalid (); - - const k_local_graph_device_type & srcGraphDevice = srcCrsGraph.getLocalGraphDevice(); - const k_local_graph_device_type & tgtGraphDevice = tgtCrsGraph.getLocalGraphDevice(); - - using local_map_type = typename crs_graph_type::map_type::local_map_type; - local_map_type srcRowMapLocal = srcCrsGraph.getRowMap()->getLocalMap(); - local_map_type srcColMapLocal = srcCrsGraph.getColMap()->getLocalMap(); - local_map_type tgtRowMapLocal = tgtCrsGraph.getRowMap()->getLocalMap(); - - auto tgtLocalRowPtrsDevice = tgtCrsGraph.getRowPtrsUnpackedDevice(); - auto tgtGlobalColInds = tgtCrsGraph.gblInds_wdv.getDeviceView(Access::ReadWrite); - auto srcLocalRowPtrsDevice = srcCrsGraph.getLocalRowPtrsDevice(); - auto srcLocalColIndsDevice = srcCrsGraph.lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly); - - typename crs_graph_type::num_row_entries_type::non_const_type h_numRowEnt = tgtCrsGraph.k_numRowEntries_; - - auto k_numRowEnt = Kokkos::create_mirror_view_and_copy (device_type (), h_numRowEnt); - - const bool sorted = false; - - bool hasMap = permuteFromLIDs.extent(0) > 0; - auto permuteToLIDs_d = permuteToLIDs.view_device (); - auto permuteFromLIDs_d = permuteFromLIDs.view_device (); - -#ifdef PANZER_DO_CHECK_INNER_HPP -#undef PANZER_DO_CHECK_INNER_HPP -#endif -#define PANZER_DO_CHECK_INNER_HPP 0 -#if PANZER_DO_CHECK_INNER_HPP -#define CHECK(a,i) do { \ - if ((int)(i) >= (int)a.extent(0)) { \ - printf("ERROR: i= %d a= %s e= %d", (int)(i), #a, (int)a.extent(0)); \ - Kokkos::abort("inding error"); \ - } } while(0) -#else -#define CHECK(a,i) do { } while(0) -#endif - -#ifdef PANZER_INNER_ABORT -#undef PANZER_INNER_ABORT -#endif - -#define PANZER_INNER_ABORT(lin) do { \ - printf("ERROR: line= %d", lin); \ - Kokkos::abort("error"); \ - } while(0) - - Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", - range_type (0, loopEnd), - KOKKOS_LAMBDA(const LO sourceLID) - { - auto srcLid = sourceLID; - auto tgtLid = sourceLID; - if (hasMap) { - srcLid = permuteFromLIDs_d(srcLid); - tgtLid = permuteToLIDs_d(tgtLid); - } - auto srcGid = srcRowMapLocal.getGlobalElement(srcLid); - if (srcGid == GINV) PANZER_INNER_ABORT(__LINE__); - auto tgtGid = tgtRowMapLocal.getGlobalElement(tgtLid); - - auto tgtLocalRow = tgtRowMapLocal.getLocalElement(tgtGid); - if (tgtLocalRow == LINV) PANZER_INNER_ABORT(__LINE__); - if (tgtLocalRow != tgtLid) PANZER_INNER_ABORT(__LINE__); - CHECK(k_numRowEnt, tgtLocalRow); - auto tgtNumEntries = k_numRowEnt(tgtLocalRow); - - // FIXME no auto use - CHECK(srcLocalRowPtrsDevice, srcLid); - auto start = srcLocalRowPtrsDevice(srcLid); - CHECK(srcLocalRowPtrsDevice, srcLid + 1); - auto end = srcLocalRowPtrsDevice(srcLid + 1); - auto rowLength = (end - start); - - //KOKKOS_ASSERT(rowLength <= max_row_entries); - - CHECK(tgtLocalRowPtrsDevice, tgtLocalRow); - auto tstart = tgtLocalRowPtrsDevice(tgtLocalRow); - auto tend = tstart + tgtNumEntries; - CHECK(tgtLocalRowPtrsDevice, tgtLocalRow + 1); - auto tend1 = tgtLocalRowPtrsDevice(tgtLocalRow + 1); - - const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; - size_t num_inserted = 0; - - CHECK(tgtGlobalColInds, tstart); - global_inds_device_value_t *tgtGlobalColIndsPtr = tgtGlobalColInds.data(); - - size_t hint=0; - for (size_t j = 0; j < rowLength; j++) { - CHECK(srcLocalColIndsDevice, start + j); - auto ci = srcLocalColIndsDevice(start + j); - GO gi = srcColMapLocal.getGlobalElement(ci); - if (gi == GINV) PANZER_INNER_ABORT(__LINE__); - auto numInTgtRow = (tend - tstart); - - const size_t offset = - KokkosSparse::findRelOffset (tgtGlobalColIndsPtr+tstart, - numInTgtRow, - gi, hint, sorted); - - if (offset == numInTgtRow) { - if (num_inserted >= num_avail) { // not enough room - //return Teuchos::OrdinalTraits::invalid(); - Kokkos::abort("num_avail"); - } - //Kokkos::atomic_store (&tgtRowVals[offset], newVals); - CHECK(tgtGlobalColInds, tstart + offset); - tgtGlobalColIndsPtr[tstart + offset] = gi; - ++tend; - hint = offset + 1; - ++num_inserted; - } - } - CHECK(k_numRowEnt, tgtLocalRow); - k_numRowEnt(tgtLocalRow) += num_inserted; - - return size_t(0); - }); - - Kokkos::fence("here 10"); - Kokkos::deep_copy(tgtCrsGraph.k_numRowEntries_, k_numRowEnt); - tgtCrsGraph.setLocallyModified(); -} - - -template -void -CrsGraph:: -copyAndPermuteNew(const row_graph_type& srcRowGraph, - row_graph_type& tgtRowGraph, - const size_t numSameIDs, - const Kokkos::DualView& permuteToLIDs, - const Kokkos::DualView& permuteFromLIDs, - const CombineMode CM) -{ - using std::endl; - using LO = local_ordinal_type; - using GO = global_ordinal_type; - using this_CRS_type = CrsGraph; - const char tfecfFuncName[] = "copyAndPermuteNew: "; - const bool verbose = verbose_; - - Details::ProfilingRegion regionCAP("Tpetra::CrsGraph::copyAndPermuteNew"); - INCL_EXP(double capTime = Teuchos::Time::wallTime()); - std::unique_ptr prefix; - if (verbose) { - prefix = this->createPrefix("CrsGraph", "copyAndPermuteNew"); - std::ostringstream os; - os << *prefix << endl; - std::cerr << os.str (); - } - - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC - (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0), - std::runtime_error, "permuteToLIDs.extent(0) = " - << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = " - << permuteFromLIDs.extent (0) << "."); - - if (verbose) { - std::ostringstream os; - os << *prefix << "Compute padding" << endl; - std::cerr << os.str (); - } - - using crs_graph_type = CrsGraph; - const crs_graph_type *srcCrsGraphPtr = dynamic_cast(&srcRowGraph); - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!srcCrsGraphPtr, std::runtime_error, - "error srcGraph type= " << typeid(srcRowGraph).name()); - const crs_graph_type& srcCrsGraph = *srcCrsGraphPtr; - - crs_graph_type *tgtCrsGraphPtr = dynamic_cast(&tgtRowGraph); - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(!srcCrsGraphPtr, std::runtime_error, - "error tgtGraph type= " << typeid(tgtRowGraph).name()); - - crs_graph_type& tgtCrsGraph = *tgtCrsGraphPtr; - - INCL_EXP(double padTime = Teuchos::Time::wallTime()); - auto padding = tgtCrsGraph.computeCrsPadding(srcRowGraph, numSameIDs, - permuteToLIDs, permuteFromLIDs, verbose); - - INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_pad"].first += -padTime + Teuchos::Time::wallTime()); - INCL_EXP(double apadTime = Teuchos::Time::wallTime()); - tgtCrsGraph.applyCrsPadding(*padding, verbose); - INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_apad"].first += -apadTime + Teuchos::Time::wallTime()); - - const map_type& srcRowMap = *(srcRowGraph.getRowMap()); - const map_type& tgtRowMap = *(tgtRowGraph.getRowMap()); - const bool src_filled = srcRowGraph.isFillComplete(); - nonconst_global_inds_host_view_type row_copy; - LO myid = 0; - - // - // "Copy" part of "copy and permute." - // - LO numSameIDs_as_LID = static_cast(numSameIDs); - using LidMapType = std::function ; - - if (src_filled || srcCrsGraphPtr == nullptr) { - if (verbose) { - std::ostringstream os; - os << *prefix << "src_filled || srcCrsGraph == nullptr" << endl; - std::cerr << os.str (); - } - // If the source graph is fill complete, we can't use view mode, - // because the data might be stored in a different format not - // compatible with the expectations of view mode. Also, if the - // source graph is not a CrsGraph, we can't use view mode, - // because RowGraph only provides copy mode access to the data. - INCL_EXP(double time = Teuchos::Time::wallTime()); -#if 0 - // std::cout << "here 3" << std::endl; - - for (size_t i = 0; i < numSameIDs; ++i, ++myid) { - const GO gid = srcRowMap.getGlobalElement (myid); - size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid); - Kokkos::resize(row_copy,row_length); - size_t check_row_length = 0; - srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); - tgtCrsGraph.insertGlobalIndices (gid, row_length, row_copy.data()); - } - // std::cout << "here 4" << std::endl; - -#else - - Kokkos::DualView noPermute; - insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, - noPermute, noPermute, - numSameIDs_as_LID); - -#endif - INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_1"].first += -time + Teuchos::Time::wallTime()); - } else { - INCL_EXP(double time = Teuchos::Time::wallTime()); - if (verbose) { - std::ostringstream os; - os << *prefix << "! src_filled && srcCrsGraph != nullptr" << endl; - std::cerr << os.str (); - } - for (size_t i = 0; i < numSameIDs; ++i, ++myid) { - const GO gid = srcRowMap.getGlobalElement (myid); - global_inds_host_view_type row; - srcCrsGraph.getGlobalRowView (gid, row); - tgtCrsGraph.insertGlobalIndices (gid, row.extent(0), row.data()); - } - INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_2"].first += -time + Teuchos::Time::wallTime()); - } - - // - // "Permute" part of "copy and permute." - // - auto permuteToLIDs_h = permuteToLIDs.view_host (); - auto permuteFromLIDs_h = permuteFromLIDs.view_host (); - auto permuteToLIDs_d = permuteToLIDs.view_device (); - auto permuteFromLIDs_d = permuteFromLIDs.view_device (); - - if (src_filled || srcCrsGraphPtr == nullptr) { - INCL_EXP(double time = Teuchos::Time::wallTime()); -#if 0 - for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { - const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); - const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); - size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (srcgid); - Kokkos::resize(row_copy,row_length); - size_t check_row_length = 0; - srcRowGraph.getGlobalRowCopy (srcgid, row_copy, check_row_length); - tgtCrsGraph.insertGlobalIndices (mygid, row_length, row_copy.data()); - } -#else - insertGlobalIndicesDevice(srcCrsGraph, tgtCrsGraph, - permuteToLIDs, permuteFromLIDs, // note reversed arg order, tgt, then src - static_cast (permuteToLIDs_h.extent (0))); -#endif - INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_3"].first += -time + Teuchos::Time::wallTime()); - } else { - INCL_EXP(double time = Teuchos::Time::wallTime()); - for (LO i = 0; i < static_cast (permuteToLIDs_h.extent (0)); ++i) { - const GO mygid = tgtRowMap.getGlobalElement (permuteToLIDs_h[i]); - const GO srcgid = srcRowMap.getGlobalElement (permuteFromLIDs_h[i]); - global_inds_host_view_type row; - srcCrsGraph.getGlobalRowView (srcgid, row); - tgtCrsGraph.insertGlobalIndices (mygid, row.extent(0), row.data()); - } - INCL_EXP(if (IN_EVAL_J) Timers["capsg_G_4"].first += -time + Teuchos::Time::wallTime()); - } - - if (verbose) { - std::ostringstream os; - os << *prefix << "Done" << endl; - std::cerr << os.str (); - } - INCL_EXP(if (IN_EVAL_J) Timers["capsg_G"].first += -capTime + Teuchos::Time::wallTime()); -} - diff --git a/packages/tpetra/core/src/inner.hpp b/packages/tpetra/core/src/inner.hpp deleted file mode 100644 index 808bdf3ac132..000000000000 --- a/packages/tpetra/core/src/inner.hpp +++ /dev/null @@ -1,135 +0,0 @@ -// const GO gid = srcRowMapLocal.getGlobalElement (sourceLID); -// size_t row_length = srcRowGraph.getNumEntriesInGlobalRow (gid); -// Kokkos::resize(row_copy,row_length); -// size_t check_row_length = 0; -// srcRowGraph.getGlobalRowCopy (gid, row_copy, check_row_length); -// tgtCrsGraph.insertGlobalIndices (gid, row_length, row_copy.data()); - -using global_inds_device_value_t = GlobalOrdinal; -using row_ptrs_device_value_t = size_t; -typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; -typedef typename Node::execution_space exec_space; -typedef Kokkos::RangePolicy range_type; - -const LocalOrdinal LINV = Teuchos::OrdinalTraits::invalid (); -const GlobalOrdinal GINV = Teuchos::OrdinalTraits::invalid (); - -typedef typename crs_graph_type::local_graph_device_type k_local_graph_device_type; -typedef typename Node::execution_space exec_space; -typedef Kokkos::RangePolicy range_type; -typedef typename Kokkos::GraphRowViewConst graph_row_view_const_type; - -const k_local_graph_device_type & srcGraphDevice = srcCrsGraph.getLocalGraphDevice(); -const k_local_graph_device_type & tgtGraphDevice = tgtCrsGraph.getLocalGraphDevice(); - -using local_map_type = typename crs_graph_type::map_type::local_map_type; -local_map_type srcRowMapLocal = srcCrsGraph.getRowMap()->getLocalMap(); -local_map_type srcColMapLocal = srcCrsGraph.getColMap()->getLocalMap(); -local_map_type tgtRowMapLocal = tgtCrsGraph.getRowMap()->getLocalMap(); - -auto tgtLocalRowPtrsDevice = tgtCrsGraph.getRowPtrsUnpackedDevice(); -auto tgtGlobalColInds = tgtCrsGraph.gblInds_wdv.getDeviceView(Access::ReadWrite); -auto srcLocalRowPtrsDevice = srcCrsGraph.getLocalRowPtrsDevice(); -auto srcLocalColIndsDevice = srcCrsGraph.lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly); - -typedef typename Node::execution_space exec_space; -typedef Kokkos::RangePolicy range_type; -typename num_row_entries_type::non_const_type h_numRowEnt = tgtCrsGraph.k_numRowEntries_; - -auto k_numRowEnt = Kokkos::create_mirror_view_and_copy (device_type (), h_numRowEnt); -LO numSameIDs_as_LID = static_cast(numSameIDs); -const bool sorted = false; - -#ifdef PANZER_DO_CHECK_INNER_HPP -#undef PANZER_DO_CHECK_INNER_HPP -#endif -#define PANZER_DO_CHECK_INNER_HPP 1 -#if PANZER_DO_CHECK_INNER_HPP -#define CHECK(a,i) do { \ - if ((int)(i) >= (int)a.extent(0)) { \ - char buf[100]; \ - sprintf(buf,"ERROR: i= %d a= %s e= %d", (int)(i), #a, (int)a.extent(0)); \ - Kokkos::abort(buf); \ - } } while(0) -#else -#define CHECK(a,i) do { } while(0) -#endif - -#ifdef PANZER_INNER_ABORT -#undef PANZER_INNER_ABORT -#endif - -#define PANZER_INNER_ABORT(lin) do { \ - char buf[100]; \ - sprintf(buf,"ERROR: line= %d", lin); \ - Kokkos::abort(buf); \ - } while(0) - -Kokkos::parallel_for("Tpetra_CrsGraph::copyAndPermuteNew2", - range_type (0, numSameIDs_as_LID), - KOKKOS_LAMBDA(const LO sourceLID) - { - auto srcGid = srcRowMapLocal.getGlobalElement(sourceLID); - if (srcGid == GINV) PANZER_INNER_ABORT(__LINE__); - auto tgtLocalRow = tgtRowMapLocal.getLocalElement(srcGid); - if (tgtLocalRow == LINV) PANZER_INNER_ABORT(__LINE__); - - CHECK(k_numRowEnt, tgtLocalRow); - auto tgtNumEntries = k_numRowEnt(tgtLocalRow); - - // FIXME no auto use - CHECK(srcLocalRowPtrsDevice, sourceLID); - auto start = srcLocalRowPtrsDevice(sourceLID); - CHECK(srcLocalRowPtrsDevice, sourceLID+1); - auto end = srcLocalRowPtrsDevice(sourceLID+1); - auto rowLength = (end - start); - - //KOKKOS_ASSERT(rowLength <= max_row_entries); - - CHECK(tgtLocalRowPtrsDevice, tgtLocalRow); - auto tstart = tgtLocalRowPtrsDevice(tgtLocalRow); - auto tend = tstart + tgtNumEntries; - CHECK(tgtLocalRowPtrsDevice, tgtLocalRow + 1); - auto tend1 = tgtLocalRowPtrsDevice(tgtLocalRow + 1); - - const size_t num_avail = (tend1 < tend) ? size_t (0) : tend1 - tend; - size_t num_inserted = 0; - - CHECK(tgtGlobalColInds, tstart); - global_inds_device_value_t *tgtGlobalColIndsPtr = tgtGlobalColInds.data(); - - size_t hint=0; - for (size_t j = 0; j < rowLength; j++) { - CHECK(srcLocalColIndsDevice, start + j); - auto ci = srcLocalColIndsDevice(start + j); - GO gi = srcColMapLocal.getGlobalElement(ci); - if (gi == GINV) PANZER_INNER_ABORT(__LINE__); - auto numInTgtRow = (tend - tstart); - - const size_t offset = - KokkosSparse::findRelOffset (tgtGlobalColIndsPtr+tstart, - numInTgtRow, - gi, hint, sorted); - - if (offset == numInTgtRow) { - if (num_inserted >= num_avail) { // not enough room - //return Teuchos::OrdinalTraits::invalid(); - Kokkos::abort("num_avail"); - } - //Kokkos::atomic_store (&tgtRowVals[offset], newVals); - CHECK(tgtGlobalColInds, tstart + offset); - tgtGlobalColIndsPtr[tstart + offset] = gi; - ++tend; - hint = offset + 1; - ++num_inserted; - } - } - CHECK(k_numRowEnt, tgtLocalRow); - k_numRowEnt(tgtLocalRow) += num_inserted; - - return size_t(0); - }); - -Kokkos::fence("here 10"); -Kokkos::deep_copy(tgtCrsGraph.k_numRowEntries_, k_numRowEnt); -tgtCrsGraph.setLocallyModified(); From 7561e1fabfc3f8a679d6e58fc0b77d357a861d01 Mon Sep 17 00:00:00 2001 From: srkenno Date: Mon, 16 Dec 2024 15:15:03 -0700 Subject: [PATCH 23/23] merge working branch --- packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp | 2 +- packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 2 +- packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index c11dae4dfb6e..6c4e61afd3fe 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -41,7 +41,7 @@ #include #include -#define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 #if EXP_INCLUDED_FROM_PANXER_MINI_EM #define INCL_EXP(a) a extern std::unordered_map> >& Timers; diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 1a79cc54c25a..51c09f043033 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -60,7 +60,7 @@ #include #include -#define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 #if EXP_INCLUDED_FROM_PANXER_MINI_EM #define INCL_EXP(a) a extern bool panzer_impl_new, panzer_impl_old; diff --git a/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp b/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp index 27dff1c605fe..5c9348c0ae97 100644 --- a/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_CrsPadding.hpp @@ -21,7 +21,7 @@ #include #include -#define EXP_INCLUDED_FROM_PANXER_MINI_EM 1 +#define EXP_INCLUDED_FROM_PANXER_MINI_EM 0 #if EXP_INCLUDED_FROM_PANXER_MINI_EM #define INCL_EXP(a) a extern std::unordered_map> >& Timers;