diff --git a/src/TiledArray/device/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h
index 45f9b63731..dec80dcaf1 100644
--- a/src/TiledArray/device/btas_um_tensor.h
+++ b/src/TiledArray/device/btas_um_tensor.h
@@ -67,7 +67,7 @@ struct ArchiveLoadImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
                           TiledArray::btasUMTensorVarray<T> &t) {
     TiledArray::Range range{};
     TiledArray::device_um_btas_varray<T> store{};
-    ar &range &store;
+    ar & range & store;
     t = TiledArray::btasUMTensorVarray<T>(std::move(range), std::move(store));
     // device::setDevice(TiledArray::deviceEnv::instance()->default_device_id());
     // auto &stream = device::stream_for(range);
@@ -83,7 +83,7 @@ struct ArchiveStoreImpl<Archive, TiledArray::btasUMTensorVarray<T>> {
     auto stream = TiledArray::device::stream_for(t.range());
     TiledArray::to_execution_space<TiledArray::ExecutionSpace::Host>(
         t.storage(), stream);
-    ar &t.range() & t.storage();
+    ar & t.range() & t.storage();
   }
 };
 
@@ -674,25 +674,12 @@ template <typename UMTensor, typename TATensor, typename Policy>
 typename std::enable_if<!std::is_same<UMTensor, TATensor>::value,
                         TiledArray::DistArray<UMTensor, Policy>>::type
 ta_tensor_to_um_tensor(const TiledArray::DistArray<TATensor, Policy> &array) {
-  auto convert_tile_memcpy = [](const TATensor &tile) {
-    /// UMTensor must be wrapped into TA::Tile
-
-    using Tensor = typename UMTensor::tensor_type;
-
-    auto stream = device::stream_for(tile.range());
-    typename Tensor::storage_type storage;
-    make_device_storage(storage, tile.range().area(), stream);
-    Tensor result(tile.range(), std::move(storage));
-
-    DeviceSafeCall(
-        device::memcpyAsync(result.data(), tile.data(),
-                            tile.size() * sizeof(typename Tensor::value_type),
-                            device::MemcpyDefault, stream));
-
-    device::sync_madness_task_with(stream);
-    return TiledArray::Tile<Tensor>(std::move(result));
-  };
+  using inT = typename TATensor::value_type;
+  using outT = typename UMTensor::value_type;
+  // check if element conversion is necessary
+  constexpr bool T_conversion = !std::is_same_v<inT, outT>;
 
+  // this is safe even when element types need to be converted, but less efficient
   auto convert_tile_um = [](const TATensor &tile) {
     /// UMTensor must be wrapped into TA::Tile
 
@@ -711,14 +698,47 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray<TATensor, Policy> &array) {
     TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(
         result.storage(), stream);
 
+    // N.B. move! without it we'd have a D-to-H transfer due to the UM
+    // allocator's construct() being called on the host
     return TiledArray::Tile<Tensor>(std::move(result));
   };
 
-  const char *use_legacy_conversion =
-      std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION");
-  auto um_array = use_legacy_conversion
-                      ? to_new_tile_type(array, convert_tile_um)
-                      : to_new_tile_type(array, convert_tile_memcpy);
+  TiledArray::DistArray<UMTensor, Policy> um_array;
+  if constexpr (T_conversion) {
+    um_array = to_new_tile_type(array, convert_tile_um);
+  } else {
+    // this is more efficient for copying:
+    // - avoids a copy on the host followed by a UM transfer; does a direct copy instead
+    // - replaces an unneeded copy (which also caused a D-to-H transfer due to
+    //   the UM allocator's construct() being called on the host) with a move
+    // Together these eliminate all spurious UM traffic in (T) W3 contractions
+    auto convert_tile_memcpy = [](const TATensor &tile) {
+      /// UMTensor must be wrapped into TA::Tile
+
+      using Tensor = typename UMTensor::tensor_type;
+
+      auto stream = device::stream_for(tile.range());
+      typename Tensor::storage_type storage;
+      make_device_storage(storage, tile.range().area(), stream);
+      Tensor result(tile.range(), std::move(storage));
+
+      DeviceSafeCall(
+          device::memcpyAsync(result.data(), tile.data(),
+                              tile.size() * sizeof(typename Tensor::value_type),
+                              device::MemcpyDefault, stream));
+
+      device::sync_madness_task_with(stream);
+      // N.B. move! without it we'd have a D-to-H transfer due to the UM
+      // allocator's construct() being called on the host
+      return TiledArray::Tile<Tensor>(std::move(result));
+    };
+
+    const char *use_legacy_conversion =
+        std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION");
+    um_array = use_legacy_conversion
+                   ? to_new_tile_type(array, convert_tile_um)
+                   : to_new_tile_type(array, convert_tile_memcpy);
+  }
 
   array.world().gop.fence();
   return um_array;
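
Note on the T_conversion dispatch: the patch replaces the old runtime choice between conversion lambdas with an `if constexpr` branch on whether the element types match, so the unused path is discarded at compile time. The following standalone C++17 sketch illustrates the same pattern; it is not TiledArray code, and the `convert` function and its branches are invented for illustration:

```cpp
#include <algorithm>
#include <cstring>
#include <type_traits>
#include <vector>

// Picks a bulk-copy path when In and Out are the same type, and an
// element-wise converting path otherwise. The branch not taken is never
// instantiated, mirroring the T_conversion dispatch in the patch.
template <typename Out, typename In>
std::vector<Out> convert(const std::vector<In> &src) {
  std::vector<Out> dst(src.size());
  if constexpr (std::is_same_v<In, Out>) {
    // same (trivially copyable) element type: one bulk copy, no per-element work
    std::memcpy(dst.data(), src.data(), src.size() * sizeof(In));
  } else {
    // element types differ: convert one element at a time
    std::copy(src.begin(), src.end(), dst.begin());
  }
  return dst;
}

int main() {
  std::vector<float> f{1.0f, 2.0f, 3.0f};
  auto same = convert<float>(f);   // takes the memcpy branch
  auto wide = convert<double>(f);  // takes the converting branch
  (void)same;
  (void)wide;
}
```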
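Note on the "N.B. move!" comments: copy-constructing the result tensor into `TiledArray::Tile` would re-construct every element through the UM allocator on the host, faulting device-resident pages back to the host; moving transfers ownership of the buffer without touching the elements. The standalone C++17 sketch below demonstrates the mechanism with a toy allocator; `LoggingAllocator` and `TileLike` are hypothetical stand-ins, not TiledArray's UM allocator or `Tile`:

```cpp
#include <cstddef>
#include <cstdio>
#include <memory>
#include <utility>
#include <vector>

// Toy allocator standing in for a unified-memory allocator. Its construct()
// runs on the host; for real UM storage each call would fault device-resident
// pages back to the host (the D-to-H traffic the patch eliminates).
template <typename T>
struct LoggingAllocator {
  using value_type = T;
  LoggingAllocator() = default;
  template <typename U>
  LoggingAllocator(const LoggingAllocator<U> &) {}
  T *allocate(std::size_t n) { return std::allocator<T>{}.allocate(n); }
  void deallocate(T *p, std::size_t n) { std::allocator<T>{}.deallocate(p, n); }
  template <typename U, typename... Args>
  void construct(U *p, Args &&...args) {
    std::puts("construct() on host");  // one call per element
    ::new (static_cast<void *>(p)) U(std::forward<Args>(args)...);
  }
  friend bool operator==(const LoggingAllocator &, const LoggingAllocator &) {
    return true;
  }
  friend bool operator!=(const LoggingAllocator &, const LoggingAllocator &) {
    return false;
  }
};

using Storage = std::vector<double, LoggingAllocator<double>>;

// Stand-in for TiledArray::Tile: wraps the storage by value.
struct TileLike {
  Storage data;
  explicit TileLike(Storage s) : data(std::move(s)) {}
};

int main() {
  Storage s(4);                  // 4 construct() calls: initial fill
  TileLike copied{s};            // copy: 4 more construct() calls on the host
  TileLike moved{std::move(s)};  // move: buffer is stolen, no construct() at all
  (void)copied;
  (void)moved;
}
```

Running this prints "construct() on host" for the fill and again for the copy, but nothing for the move, which is exactly why the patch returns `TiledArray::Tile<Tensor>(std::move(result))`.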