diff --git a/BENCHMARKING.md b/BENCHMARKING.md index 043cdd71cf7..883100d0f02 100644 --- a/BENCHMARKING.md +++ b/BENCHMARKING.md @@ -305,3 +305,5 @@ The supported environment variables are described in the following list: values as the right-hand side in solver benchmarks. Default is `unit`. * `DETAILED={0,1}` - selects whether detailed benchmarks should be ran for the solver benchmarks, can be either `0` (off) or `1` (on). The default is `0`. +* `GPU_TIMER={true, false}` - If set to `true`, use the gpu timer, which is + valid for cuda/hip executor, to measure the timing. Default is `false`. diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh index 24bea8efbe2..d8bb6c6a88b 100644 --- a/benchmark/run_all_benchmarks.sh +++ b/benchmark/run_all_benchmarks.sh @@ -149,7 +149,7 @@ run_conversion_benchmarks() { cp "$1" "$1.imd" # make sure we're not loosing the original input ./conversions/conversions --backup="$1.bkp" --double_buffer="$1.bkp2" \ --executor="${EXECUTOR}" --formats="${FORMATS}" \ - --device_id="${DEVICE_ID}" --gpu_time=${GPU_TIMER} \ + --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \ <"$1.imd" 2>&1 >"$1" keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" } @@ -165,7 +165,7 @@ run_spmv_benchmarks() { cp "$1" "$1.imd" # make sure we're not loosing the original input ./spmv/spmv --backup="$1.bkp" --double_buffer="$1.bkp2" \ --executor="${EXECUTOR}" --formats="${FORMATS}" \ - --device_id="${DEVICE_ID}" --gpu_time=${GPU_TIMER} \ + --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \ <"$1.imd" 2>&1 >"$1" keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" } @@ -183,7 +183,7 @@ run_solver_benchmarks() { --executor="${EXECUTOR}" --solvers="${SOLVERS}" \ --preconditioners="${PRECONDS}" \ --max_iters=${SOLVERS_MAX_ITERATIONS} --rel_res_goal=${SOLVERS_PRECISION} \ - ${SOLVERS_RHS_FLAG} ${DETAILED_STR} --device_id="${DEVICE_ID}" --gpu_time=${GPU_TIMER} \ + ${SOLVERS_RHS_FLAG} ${DETAILED_STR} --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
<"$1.imd" 2>&1 >"$1" keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" } @@ -209,7 +209,7 @@ run_preconditioner_benchmarks() { --executor="${EXECUTOR}" --preconditioners="jacobi" \ --jacobi_max_block_size="${bsize}" \ --jacobi_storage="${prec}" \ - --device_id="${DEVICE_ID}" --gpu_time=${GPU_TIMER} \ + --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \ <"$1.imd" 2>&1 >"$1" keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" done diff --git a/benchmark/utils/timer.hpp b/benchmark/utils/timer.hpp index ffd39919337..f4edcd15c1a 100644 --- a/benchmark/utils/timer.hpp +++ b/benchmark/utils/timer.hpp @@ -62,16 +62,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/base/device_guard.hip.hpp" -#endif // HAS_CUDA +#endif // HAS_HIP +// Command-line arguments DEFINE_bool(gpu_timer, false, "use gpu timer based on event. It is valid only when " "executor is cuda or hip"); +/** + * Timer stores the timing information + */ class Timer { public: + /** + * Starts the timer + */ void tic() { assert(tic_called_ == false); @@ -79,25 +86,65 @@ class Timer { tic_called_ = true; } - std::size_t toc() + /** + * Finishes the timer and records the elapsed time + */ + void toc() { assert(tic_called_ == true); auto ns = this->toc_impl(); tic_called_ = false; this->add_record(ns); - return ns; } - std::size_t get_total_time() { return total_duration_ns_; } - - std::size_t get_tictoc_num() { return duration_ns_.size(); } - - double get_average_time() + /** + * Get the sum of all recorded times in nanoseconds. + * + * @return the total time in nanoseconds + */ + std::int64_t get_total_time() const { return total_duration_ns_; } + + /** + * Get the number of repetitions.
+ * + * @return the number of repetitions + */ + std::int64_t get_num_repetitions() const { return duration_ns_.size(); } + + /** + * Get the average time of repetitions in nanoseconds + * + * @return the average time in nanoseconds + */ + double get_average_time() const { return static_cast(this->get_total_time()) / - this->get_tictoc_num(); + this->get_num_repetitions(); + } + + /** + * Get the vector containing each individual time result in nanoseconds. + * + * @return the vector of individual time results in nanoseconds + */ + std::vector get_time_detail() const { return duration_ns_; } + + /** + * Get the latest result in nanoseconds. If there is no result yet, return 0. + * + * @return the latest result in nanoseconds + */ + std::int64_t get_latest_time() const { + if (duration_ns_.size() >= 1) { + return duration_ns_.back(); + } else { + return 0; + } } + /** + * Clear the results of the timer + */ void clear() { duration_ns_.clear(); @@ -105,29 +152,52 @@ class Timer { total_duration_ns_ = 0; } + /** + * Create a timer + */ Timer() : tic_called_(false), total_duration_ns_(0) {} protected: - void add_record(std::size_t ns) + /** + * Append the nanosecond result to the records and add it to the total + */ + void add_record(std::int64_t ns) { // add the result; duration_ns_.emplace_back(ns); total_duration_ns_ += ns; } + /** + * The implementation of tic. + */ virtual void tic_impl() = 0; - virtual std::size_t toc_impl() = 0; + /** + * The implementation of toc. Returns the result in nanoseconds. + * + * @return the result in nanoseconds + */ + virtual std::int64_t toc_impl() = 0; private: - std::vector duration_ns_; + std::vector duration_ns_; bool tic_called_; - std::size_t total_duration_ns_; + std::int64_t total_duration_ns_; }; +/** + * CpuTimer uses the executor's synchronize and std::chrono to measure the + * timing.
+ */ class CpuTimer : public Timer { public: + /** + * Create a CpuTimer + * + * @param exec Executor associated with the timer + */ CpuTimer(std::shared_ptr exec) : Timer(), exec_(exec) {} @@ -138,14 +208,14 @@ class CpuTimer : public Timer { start_ = std::chrono::steady_clock::now(); } - std::size_t toc_impl() override + std::int64_t toc_impl() override { exec_->synchronize(); auto stop = std::chrono::steady_clock::now(); auto duration_time = std::chrono::duration_cast(stop - start_) .count(); - return static_cast(duration_time); + return duration_time; } private: @@ -157,12 +227,25 @@ class CpuTimer : public Timer { #ifdef HAS_CUDA +/** + * CudaTimer uses the cuda executor and cudaEvent to measure the timing. + */ class CudaTimer : public Timer { public: + /** + * Create a CudaTimer. + * + * @param exec Executor that must actually be a CudaExecutor + */ CudaTimer(std::shared_ptr exec) : CudaTimer(std::dynamic_pointer_cast(exec)) {} + /** + * Create a CudaTimer. + * + * @param exec CudaExecutor associated with the timer + */ CudaTimer(std::shared_ptr exec) : Timer() { assert(exec != nullptr); @@ -181,7 +264,7 @@ class CudaTimer : public Timer { GKO_ASSERT_NO_CUDA_ERRORS(cudaEventRecord(start_)); } - std::size_t toc_impl() override + std::int64_t toc_impl() override { gko::cuda::device_guard g{id_}; // Currently, gko::CudaExecutor always use default stream. @@ -192,7 +275,7 @@ class CudaTimer : public Timer { // resolution of around 0.5 microseconds GKO_ASSERT_NO_CUDA_ERRORS( cudaEventElapsedTime(&duration_time, start_, stop_)); - return static_cast(duration_time * 1e6); + return static_cast(duration_time * 1e6); } private: @@ -209,12 +292,25 @@ class CudaTimer : public Timer { #ifdef HAS_HIP +/** + * HipTimer uses the hip executor and hipEvent to measure the timing. + */ class HipTimer : public Timer { public: + /** + * Create a HipTimer.
+ * + * @param exec Executor that must actually be a HipExecutor + */ HipTimer(std::shared_ptr exec) : HipTimer(std::dynamic_pointer_cast(exec)) {} + /** + * Create a HipTimer. + * + * @param exec HipExecutor associated with the timer + */ HipTimer(std::shared_ptr exec) : Timer() { assert(exec != nullptr); @@ -233,7 +329,7 @@ class HipTimer : public Timer { GKO_ASSERT_NO_HIP_ERRORS(hipEventRecord(start_)); } - std::size_t toc_impl() override + std::int64_t toc_impl() override { gko::hip::device_guard g{id_}; // Currently, gko::HipExecutor always use default stream. @@ -244,7 +340,7 @@ class HipTimer : public Timer { // resolution of around 0.5 microseconds GKO_ASSERT_NO_HIP_ERRORS( hipEventElapsedTime(&duration_time, start_, stop_)); - return static_cast(duration_time * 1e6); + return static_cast(duration_time * 1e6); } private: @@ -258,6 +354,13 @@ class HipTimer : public Timer { #endif // HAS_HIP +/** + * Get the timer. If the executor does not support the gpu timer, the cpu + * timer is returned instead. + * + * @param exec Executor associated with the timer + * @param use_gpu_timer whether to use the gpu timer + */ std::shared_ptr get_timer(std::shared_ptr exec, bool use_gpu_timer) { @@ -276,6 +379,6 @@ std::shared_ptr get_timer(std::shared_ptr exec, } #endif // HAS_HIP } - // Not use gpu_timer or not cuda/hip executor + // No cuda/hip executor available or no gpu_timer used return std::make_shared(exec); }